
Commit bdefd02

layernorm: add gradient updates
1 parent c4a3e3c commit bdefd02

3 files changed: +125 additions, -0 deletions

src/nf/nf_layernorm.f90

Lines changed: 27 additions & 0 deletions
@@ -32,6 +32,10 @@ module nf_layernorm_layer
     procedure :: forward
     procedure :: backward
     procedure :: init
+    procedure :: get_num_params
+    procedure :: get_params
+    procedure :: get_gradients
+    procedure :: set_params
   end type layernorm_layer
 
   interface layernorm_layer
@@ -57,5 +61,28 @@ module subroutine init(self, input_shape)
       class(layernorm_layer), intent(in out) :: self
       integer, intent(in) :: input_shape(:)
     end subroutine init
+
+    pure module function get_num_params(self) result(num_params)
+      class(layernorm_layer), intent(in) :: self
+      integer :: num_params
+    end function get_num_params
+
+
+    module function get_params(self) result(params)
+      class(layernorm_layer), intent(in), target :: self
+      real, allocatable :: params(:)
+    end function get_params
+
+
+    module function get_gradients(self) result(gradients)
+      class(layernorm_layer), intent(in), target :: self
+      real, allocatable :: gradients(:)
+    end function get_gradients
+
+
+    module subroutine set_params(self, params)
+      class(layernorm_layer), intent(in out) :: self
+      real, intent(in), target :: params(:)
+    end subroutine set_params
   end interface
 end module nf_layernorm_layer
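
Taken together, these additions give layernorm_layer a flat parameter interface: get_num_params reports the total count, get_params and get_gradients flatten gamma and beta (and their gradients) into a single rank-1 array, and set_params writes such an array back. A minimal round-trip sketch under the same shapes as the test file further down; the program name is illustrative only and not part of this commit:

program layernorm_params_roundtrip
  ! Sketch only: exercise the new accessors on a freshly initialized layer.
  use nf_layernorm_layer, only: layernorm_layer
  implicit none

  type(layernorm_layer) :: norm
  real, allocatable :: params(:)

  norm = layernorm_layer()
  call norm % init([3, 4])   ! sequence_length = 3, model_dimension = 4

  ! gamma and beta together: 2 * model_dimension = 8
  print '(a, i0)', 'number of parameters: ', norm % get_num_params()

  ! Flattened as [gamma, beta]; writing the same array back leaves the layer unchanged
  params = norm % get_params()
  call norm % set_params(params)
end program layernorm_params_roundtrip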

src/nf/nf_layernorm_submodule.f90

Lines changed: 48 additions & 0 deletions
@@ -107,4 +107,52 @@ module subroutine init(self, input_shape)
 
     allocate(self % output(self % sequence_length, self % model_dimension))
   end subroutine init
+
+  pure module function get_num_params(self) result(num_params)
+    class(layernorm_layer), intent(in) :: self
+    integer :: num_params
+
+    ! gamma and beta each hold model_dimension parameters
+    num_params = 2 * self % model_dimension
+
+  end function get_num_params
+
+
+  module function get_params(self) result(params)
+    class(layernorm_layer), intent(in), target :: self
+    real, allocatable :: params(:)
+
+    params = [ &
+      self % gamma, &
+      self % beta &
+    ]
+
+  end function get_params
+
+
+  module function get_gradients(self) result(gradients)
+    class(layernorm_layer), intent(in), target :: self
+    real, allocatable :: gradients(:)
+
+    gradients = [ &
+      self % d_gamma, &
+      self % d_beta &
+    ]
+
+  end function get_gradients
+
+
+  module subroutine set_params(self, params)
+    class(layernorm_layer), intent(in out) :: self
+    real, intent(in), target :: params(:)
+
+    ! Check that the number of parameters matches the layer
+    if (size(params) /= self % get_num_params()) then
+      error stop 'Error: number of parameters does not match'
+    end if
+
+    self % gamma = params(1: self % model_dimension)
+    self % beta = params(self % model_dimension + 1: 2 * self % model_dimension)
+
+  end subroutine set_params
 end submodule nf_layernorm_layer_submodule
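
Because get_params and get_gradients share the [gamma, beta] layout, a plain gradient-descent step can be expressed directly on the flattened arrays. A minimal sketch, assuming forward and backward have already run so that d_gamma and d_beta are populated; the subroutine name is hypothetical:

subroutine descend_layernorm(norm, learning_rate)
  ! Sketch: one plain gradient-descent step on gamma and beta.
  use nf_layernorm_layer, only: layernorm_layer
  implicit none
  type(layernorm_layer), intent(in out) :: norm
  real, intent(in) :: learning_rate
  real, allocatable :: params(:)

  ! Both arrays are laid out as [gamma(1:d), beta(1:d)] with d = model_dimension
  params = norm % get_params() - learning_rate * norm % get_gradients()
  call norm % set_params(params)
end subroutine descend_layernorm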

test/test_layernorm.f90

Lines changed: 50 additions & 0 deletions
@@ -1,6 +1,7 @@
 program test_layernorm
   use iso_fortran_env, only: stderr => error_unit
   use nf_layernorm_layer, only: layernorm_layer
+  use nf, only: sgd
   implicit none
 
   logical :: ok = .true.
@@ -13,6 +14,7 @@ program test_layernorm
 
   call test_layernorm_forward(layernorm, sample_input, ok)
   call test_layernorm_backward(layernorm, sample_input, sample_gradient, ok)
+  call test_layernorm_gradients(sample_input, sample_gradient, ok)
 
   if (ok) then
     print '(a)', 'test_layernorm_layer: All tests passed.'
@@ -90,4 +92,52 @@ subroutine test_layernorm_backward(layernorm, input, gradient, ok)
     end if
   end subroutine test_layernorm_backward
 
+  subroutine test_layernorm_gradients(input, gradient, ok)
+    real, intent(in out) :: input(:, :)
+    real, intent(in out) :: gradient(:, :)
+    logical, intent(in out) :: ok
+    type(layernorm_layer) :: layernorm
+    type(sgd) :: optim
+
+    real :: parameters(8)
+    real :: expected_parameters(8)
+    real :: updated_output(12)
+    real :: expected_updated_output(12) = [ &
+      -0.738849819, 0.881645918, -1.03555739, &
+      1.66299772, -1.02966857, 0.908487320, &
+      -0.562230229, 1.01311040, 0.984123051, &
+      -0.564699769, -1.13543355, -1.11444426 &
+    ]
+
+    layernorm = layernorm_layer()
+    call layernorm % init([3, 4])
+
+    call layernorm % forward(input)
+    call layernorm % backward(input, gradient)
+
+    if (layernorm % get_num_params() /= 8) then
+      ok = .false.
+      write(stderr, '(a)') 'incorrect number of parameters.. failed'
+    end if
+
+    expected_parameters(1: 4) = 1.
+    expected_parameters(5: 8) = 0.
+    parameters = layernorm % get_params()
+    if (.not. all(parameters.eq.expected_parameters)) then
+      ok = .false.
+      write(stderr, '(a)') 'incorrect parameters.. failed'
+    end if
+
+    optim = SGD(learning_rate=0.01)
+    call optim % minimize(parameters, layernorm % get_gradients())
+    call layernorm % set_params(parameters)
+
+    call layernorm % forward(input)
+
+    updated_output = reshape(layernorm % output, [12])
+    if (.not. all(updated_output.eq.expected_updated_output)) then
+      ok = .false.
+      write(stderr, '(a)') 'incorrect output after parameters update.. failed'
+    end if
+  end subroutine test_layernorm_gradients
 end program test_layernorm
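
For reference, the expected_updated_output values rely on SGD with no momentum, so the optimizer call in the test is assumed to reduce to subtracting learning_rate times the gradients before the parameters are written back. A sketch of that assumption, reusing the test's variables; it describes the intent, not the optimizer's actual internals:

! Assumed equivalent of:
!   optim = SGD(learning_rate=0.01)
!   call optim % minimize(parameters, layernorm % get_gradients())
parameters = parameters - 0.01 * layernorm % get_gradients()
call layernorm % set_params(parameters)
call layernorm % forward(input)   ! produces the output compared against expected_updated_output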
