Skip to content

Commit b64038a

Browse files
committed
Implemented locally connected 1d
1 parent 52f958f commit b64038a

File tree

3 files changed

+25
-60
lines changed

3 files changed

+25
-60
lines changed

example/cnn_mnist_1d.f90

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,9 +21,9 @@ program cnn_mnist_1d
2121
net = network([ &
2222
input(784), &
2323
reshape2d([28,28]), &
24-
conv1d(filters=8, kernel_size=3, activation=relu()), &
24+
locally_connected_1d(filters=8, kernel_size=3, activation=relu()), &
2525
maxpool1d(pool_size=2), &
26-
conv1d(filters=16, kernel_size=3, activation=relu()), &
26+
locally_connected_1d(filters=16, kernel_size=3, activation=relu()), &
2727
maxpool1d(pool_size=2), &
2828
dense(10, activation=softmax()) &
2929
])

src/nf/nf_locally_connected_1d.f90

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,13 +16,13 @@ module nf_locally_connected_1d_layer
1616
integer :: kernel_size
1717
integer :: filters
1818

19-
real, allocatable :: biases(:) ! size(filters)
20-
real, allocatable :: kernel(:,:,:) ! filters x channels x window x window
19+
real, allocatable :: biases(:,:) ! filters x output width (one bias per filter per output position)
20+
real, allocatable :: kernel(:,:,:,:) ! filters x output width x channels x kernel size (untied weights per position)
2121
real, allocatable :: output(:,:) ! filters x output width
2222
real, allocatable :: z(:,:) ! kernel .dot. input + bias
2323

24-
real, allocatable :: dw(:,:,:) ! weight (kernel) gradients
25-
real, allocatable :: db(:) ! bias gradients
24+
real, allocatable :: dw(:,:,:,:) ! weight (kernel) gradients
25+
real, allocatable :: db(:,:) ! bias gradients
2626
real, allocatable :: gradient(:,:)
2727

2828
class(activation_function), allocatable :: activation

src/nf/nf_locally_connected_1d_submodule.f90

Lines changed: 19 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ module function locally_connected_1d_layer_cons(filters, kernel_size, activation
1717
res % kernel_size = kernel_size
1818
res % filters = filters
1919
res % activation_name = activation % get_name()
20-
allocate( res % activation, source = activation )
20+
allocate(res % activation, source = activation)
2121
end function locally_connected_1d_layer_cons
2222

2323
module subroutine init(self, input_shape)
@@ -28,16 +28,14 @@ module subroutine init(self, input_shape)
2828
self % channels = input_shape(1)
2929
self % width = input_shape(2) - self % kernel_size + 1
3030

31-
! Output of shape: filters x width
3231
allocate(self % output(self % filters, self % width))
3332
self % output = 0
3433

35-
! Kernel of shape: filters x channels x kernel_size
36-
allocate(self % kernel(self % filters, self % channels, self % kernel_size))
34+
allocate(self % kernel(self % filters, self % width, self % channels, self % kernel_size))
3735
call random_normal(self % kernel)
3836
self % kernel = self % kernel / real(self % kernel_size**2)
3937

40-
allocate(self % biases(self % filters))
38+
allocate(self % biases(self % filters, self % width))
4139
self % biases = 0
4240

4341
allocate(self % z, mold=self % output)
@@ -51,7 +49,6 @@ module subroutine init(self, input_shape)
5149

5250
allocate(self % db, mold=self % biases)
5351
self % db = 0
54-
5552
end subroutine init
5653

5754
pure module subroutine forward(self, input)
@@ -60,113 +57,81 @@ pure module subroutine forward(self, input)
6057
real, intent(in) :: input(:,:)
6158
integer :: input_channels, input_width
6259
integer :: j, n
63-
integer :: iws, iwe, half_window
60+
integer :: iws, iwe
6461

6562
input_channels = size(input, dim=1)
6663
input_width = size(input, dim=2)
67-
half_window = self % kernel_size / 2
6864

69-
! Loop over output positions.
7065
do j = 1, self % width
71-
! Compute the input window corresponding to output index j.
72-
! In forward: center index = j + half_window, so window = indices j to j+kernel_size-1.
7366
iws = j
7467
iwe = j + self % kernel_size - 1
75-
76-
! For each filter, compute the convolution (inner product over channels and kernel width).
7768
do concurrent (n = 1:self % filters)
78-
self % z(n, j) = sum(self % kernel(n, :, :) * input(:, iws:iwe))
69+
self % z(n, j) = sum(self % kernel(n, j, :, :) * input(:, iws:iwe)) + self % biases(n, j)
7970
end do
80-
81-
! Add the bias for each filter.
82-
self % z(:, j) = self % z(:, j) + self % biases
8371
end do
84-
85-
! Apply the activation function.
8672
self % output = self % activation % eval(self % z)
8773
end subroutine forward
8874

8975
pure module subroutine backward(self, input, gradient)
9076
implicit none
9177
class(locally_connected_1d_layer), intent(in out) :: self
92-
! 'input' has shape: (channels, input_width)
93-
! 'gradient' (dL/dy) has shape: (filters, output_width)
9478
real, intent(in) :: input(:,:)
9579
real, intent(in) :: gradient(:,:)
96-
9780
integer :: input_channels, input_width, output_width
9881
integer :: j, n, k
99-
integer :: iws, iwe, half_window
100-
real :: gdz_val
82+
integer :: iws, iwe
83+
real :: gdz(self % filters, self % width)
84+
real :: db_local(self % filters, self % width)
85+
real :: dw_local(self % filters, self % width, self % channels, self % kernel_size)
10186

102-
! Local arrays to accumulate gradients.
103-
real :: gdz(self % filters, self % width) ! local gradient (dL/dz)
104-
real :: db_local(self % filters)
105-
real :: dw_local(self % filters, self % channels, self % kernel_size)
106-
107-
! Determine dimensions.
10887
input_channels = size(input, dim=1)
10988
input_width = size(input, dim=2)
110-
output_width = self % width ! Note: output_width = input_width - kernel_size + 1
111-
112-
half_window = self % kernel_size / 2
89+
output_width = self % width
11390

114-
!--- Compute the local gradient gdz = (dL/dy) * sigma'(z) for each output.
11591
do j = 1, output_width
11692
gdz(:, j) = gradient(:, j) * self % activation % eval_prime(self % z(:, j))
11793
end do
11894

119-
!--- Compute bias gradients: db(n) = sum_j gdz(n, j)
12095
do n = 1, self % filters
121-
db_local(n) = sum(gdz(n, :))
96+
do j = 1, output_width
97+
db_local(n, j) = gdz(n, j)
98+
end do
12299
end do
123100

124-
!--- Initialize weight gradient and input gradient accumulators.
125101
dw_local = 0.0
126102
self % gradient = 0.0
127103

128-
!--- Accumulate gradients over each output position.
129-
! In the forward pass the window for output index j was:
130-
! iws = j, iwe = j + kernel_size - 1.
131104
do n = 1, self % filters
132105
do j = 1, output_width
133106
iws = j
134107
iwe = j + self % kernel_size - 1
135108
do k = 1, self % channels
136-
! Weight gradient: accumulate contribution from the input window.
137-
dw_local(n, k, :) = dw_local(n, k, :) + input(k, iws:iwe) * gdz(n, j)
138-
! Input gradient: propagate gradient back to the input window.
139-
self % gradient(k, iws:iwe) = self % gradient(k, iws:iwe) + self % kernel(n, k, :) * gdz(n, j)
109+
dw_local(n, j, k, :) = dw_local(n, j, k, :) + input(k, iws:iwe) * gdz(n, j)
110+
self % gradient(k, iws:iwe) = self % gradient(k, iws:iwe) + self % kernel(n, j, k, :) * gdz(n, j)
140111
end do
141112
end do
142113
end do
143114

144-
!--- Update stored gradients.
145115
self % dw = self % dw + dw_local
146116
self % db = self % db + db_local
147-
148117
end subroutine backward
149118

150119
pure module function get_num_params(self) result(num_params)
151120
class(locally_connected_1d_layer), intent(in) :: self
152121
integer :: num_params
153-
num_params = product(shape(self % kernel)) + size(self % biases)
122+
num_params = product(shape(self % kernel)) + product(shape(self % biases))
154123
end function get_num_params
155124

156125
module function get_params(self) result(params)
157126
class(locally_connected_1d_layer), intent(in), target :: self
158127
real, allocatable :: params(:)
159-
real, pointer :: w_(:) => null()
160-
w_(1:size(self % kernel)) => self % kernel
161-
params = [ w_, self % biases ]
128+
params = [reshape(self % kernel, [size(self % kernel)]), reshape(self % biases, [size(self % biases)])]
162129
end function get_params
163130

164131
module function get_gradients(self) result(gradients)
165132
class(locally_connected_1d_layer), intent(in), target :: self
166133
real, allocatable :: gradients(:)
167-
real, pointer :: dw_(:) => null()
168-
dw_(1:size(self % dw)) => self % dw
169-
gradients = [ dw_, self % db ]
134+
gradients = [reshape(self % dw, [size(self % dw)]), reshape(self % db, [size(self % db)])]
170135
end function get_gradients
171136

172137
module subroutine set_params(self, params)
@@ -179,7 +144,7 @@ module subroutine set_params(self, params)
179144

180145
self % kernel = reshape(params(:product(shape(self % kernel))), shape(self % kernel))
181146
associate(n => product(shape(self % kernel)))
182-
self % biases = params(n + 1 : n + self % filters)
147+
self % biases = reshape(params(n + 1 :), shape(self % biases))
183148
end associate
184149

185150
end subroutine set_params

0 commit comments

Comments
 (0)