module nf_layernorm_layer
  use nf_activation, only: activation_function
  use nf_base_layer, only: base_layer

  implicit none

  private
  public :: layernorm_layer

  type, extends(base_layer) :: layernorm_layer
    !! Layer Normalization
    !! (x - mean(x)) / sqrt(variance(x) + eps) * gamma + beta
    !! Based upon `Ba, Jimmy Lei, Jamie Ryan Kiros, and Geoffrey E. Hinton (2016)`:
    !! https://arxiv.org/abs/1607.06450v1
    integer :: sequence_length
    integer :: model_dimension

    real :: eps
    real, allocatable :: gamma(:)
    real, allocatable :: beta(:)

    real, allocatable :: d_gamma(:)
    real, allocatable :: d_beta(:)
    real, allocatable :: gradient(:, :)

    real, allocatable :: mu(:, :)
    real, allocatable :: sigma(:)

    real, allocatable :: output(:, :)

  contains
    procedure :: forward
    procedure :: backward
    procedure :: spread_by_sequence
    procedure :: spread_by_model_dim
    procedure :: init
  end type layernorm_layer

  interface layernorm_layer
    module function layernorm_layer_cons(sequence_length, model_dimension) &
        result(res)
      integer, intent(in) :: sequence_length, model_dimension
      type(layernorm_layer) :: res
    end function layernorm_layer_cons
  end interface layernorm_layer

contains

  module function layernorm_layer_cons(sequence_length, model_dimension) &
      result(res)
    integer, intent(in) :: sequence_length, model_dimension
    type(layernorm_layer) :: res

    res % sequence_length = sequence_length
    res % model_dimension = model_dimension
    res % eps = 1e-5
  end function layernorm_layer_cons
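
  ! A minimal usage sketch (illustrative only; the shapes and the direct calls
  ! below are assumptions for demonstration, not part of this module):
  !
  !   type(layernorm_layer) :: ln
  !   real :: x(8, 64), dy(8, 64)
  !   ln = layernorm_layer(sequence_length=8, model_dimension=64)
  !   call ln % init([0])          ! input_shape is not used; sizes come from the constructor
  !   call ln % forward(x)         ! result stored in ln % output, shape (8, 64)
  !   call ln % backward(x, dy)    ! fills ln % gradient, ln % d_gamma, ln % d_beta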

  pure module subroutine forward(self, input)
    class(layernorm_layer), intent(in out) :: self
    real, intent(in) :: input(:, :)
    real, allocatable :: normalized(:, :)
    integer :: i

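    ! input is laid out as (sequence_length, model_dimension); statistics are
    ! computed over dim=2, i.e. independently for each sequence position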
    allocate(normalized(self % sequence_length, self % model_dimension))

    ! mu = x - MEAN_last_dim(x)
    do concurrent(i = 1: self % model_dimension)
      self % mu(:, i) = input(:, i) - (sum(input, dim=2) / self % model_dimension)
    end do

    ! square root of variance shifted by eps
    self % sigma = sqrt((sum(self % mu ** 2, dim=2) / self % model_dimension) + self % eps)

    ! normalize mu by sigma, one feature column at a time
    do concurrent(i = 1: self % model_dimension)
      normalized(:, i) = self % mu(:, i) / self % sigma
    end do

    ! forward through trainable params gamma and beta
    do concurrent(i = 1: self % sequence_length)
      self % output(i, :) = normalized(i, :) * self % gamma + self % beta
    end do

    deallocate(normalized)
  end subroutine forward
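
  ! Worked example for a single sequence position (illustrative numbers only):
  ! x = [1, 2, 3]  =>  mean = 2, mu = [-1, 0, 1], variance = 2/3,
  ! sigma = sqrt(2/3 + eps) ~ 0.8165, normalized ~ [-1.2247, 0, 1.2247];
  ! with the default gamma = 1 and beta = 0, the output equals the normalized values.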

  pure module subroutine backward(self, input, gradient)
    class(layernorm_layer), intent(in out) :: self
    real, intent(in) :: input(:, :)
    real, intent(in) :: gradient(:, :)
    real, allocatable :: one_over_sigma(:, :)
    real, allocatable :: gradient_by_gamma_over_sigma(:, :)

    allocate(one_over_sigma(self % sequence_length, self % model_dimension))
    allocate(gradient_by_gamma_over_sigma(self % sequence_length, self % model_dimension))

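    ! 1/sigma broadcast across the model dimension; combined with gamma here
    ! because both factors appear in every term of the input gradient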
    one_over_sigma = (1 / self % spread_by_model_dim(self % sigma))
    gradient_by_gamma_over_sigma = gradient * self % spread_by_sequence(self % gamma) * one_over_sigma

    ! d_output/d_gamma = sum(d_output/d_y * mu/sigma)
    self % d_gamma = sum(gradient * self % mu * one_over_sigma, dim=1)

    ! d_output/d_beta = sum(d_output/d_y) * 1
    self % d_beta = sum(gradient, dim=1)

    ! From this article:
    ! https://robotchinwag.com/posts/layer-normalization-deriving-the-gradient-for-the-backward-pass/
    ! d_output/d_x = d_output/d_y * gamma/sigma
    !   - sum(d_output/d_y * gamma/sigma) / len
    !   - mu * sum(d_output/d_y * gamma * mu * sigma^(-3)) / len
    self % gradient = &
        gradient_by_gamma_over_sigma &
        - self % spread_by_model_dim(sum(gradient_by_gamma_over_sigma, dim=2)) / self % model_dimension &
        - self % mu * self % spread_by_model_dim(sum( &
            gradient_by_gamma_over_sigma * self % mu * (one_over_sigma ** 2), &
            dim=2) &
          ) / self % model_dimension

    deallocate(one_over_sigma)
    deallocate(gradient_by_gamma_over_sigma)
  end subroutine backward
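
  ! Note: gradient_by_gamma_over_sigma * one_over_sigma**2 above equals
  ! d_output/d_y * gamma / sigma**3, i.e. the factor inside the last sum of the
  ! referenced derivation.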

  pure function spread_by_sequence(self, input) result(output)
    class(layernorm_layer), intent(in) :: self
    real, intent(in) :: input(:)
    real :: output(self % sequence_length, self % model_dimension)

    output = spread(input, dim=1, ncopies=self % sequence_length)
  end function spread_by_sequence

  pure function spread_by_model_dim(self, input) result(output)
    class(layernorm_layer), intent(in) :: self
    real, intent(in) :: input(:)
    real :: output(self % sequence_length, self % model_dimension)

    output = spread(input, dim=2, ncopies=self % model_dimension)
  end function spread_by_model_dim
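
  ! The spread intrinsic replicates a rank-1 array along a new dimension, e.g.
  ! spread([a, b], dim=1, ncopies=3) yields a 3x2 array whose every row is [a, b].
  ! The two helpers above use it to broadcast vectors such as gamma (one value per
  ! feature) and sigma (one value per sequence position) to the full
  ! (sequence_length, model_dimension) shape.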

  module subroutine init(self, input_shape)
    class(layernorm_layer), intent(in out) :: self
    integer, intent(in) :: input_shape(:)

    ! default initialization from PyTorch
    allocate(self % gamma(self % model_dimension))
    self % gamma = 1.
    allocate(self % beta(self % model_dimension))
    self % beta = 0.

    allocate(self % d_gamma(self % model_dimension))
    allocate(self % d_beta(self % model_dimension))
    allocate(self % gradient(self % sequence_length, self % model_dimension))

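    ! mu caches the centered input; sigma holds one standard deviation per sequence position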
    allocate(self % mu(self % sequence_length, self % model_dimension))
    allocate(self % sigma(self % sequence_length))

    allocate(self % output(self % sequence_length, self % model_dimension))
  end subroutine init

end module nf_layernorm_layer