
Commit 4005a30

multihead_attention: proof of concept backward (works, but not mathematically correct)
1 parent abb02eb
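
As the commit message says, this backward pass is a proof of concept: it only propagates the gradient through the linear query/key/value/output layers and skips the attention itself. For reference, a mathematically complete backward pass for scaled dot-product attention would also need the following standard gradients (not part of this commit). Let $S = \frac{QK^\top}{\sqrt{d_k}}$, $A = \mathrm{softmax}(S)$ (row-wise), and $O = AV$. Given the upstream gradient $dO$:

$$
\begin{aligned}
dV &= A^\top \, dO, \\
dA &= dO \, V^\top, \\
dS &= A \odot \bigl( dA - \mathrm{rowsum}(dA \odot A) \bigr), \\
dQ &= \frac{dS \, K}{\sqrt{d_k}}, \qquad dK = \frac{dS^\top \, Q}{\sqrt{d_k}},
\end{aligned}
$$

where $\mathrm{rowsum}$ sums over the key dimension (broadcast back over each row) and $\odot$ is the element-wise product. These per-head gradients would then be combined across heads and fed into the backward passes of the query, key, and value linear layers.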

2 files changed: +49 −4 lines changed

src/nf/nf_multihead_attention.f90

Lines changed: 29 additions & 4 deletions
@@ -26,9 +26,12 @@ module nf_multihead_attention_layer
     real, allocatable :: sdpa(:, :, :, :)
     real, allocatable :: output(:, :, :)
 
+    real, allocatable :: q_input(:, :, :)
+    real, allocatable :: k_input(:, :, :)
+    real, allocatable :: v_input(:, :, :)
   contains
 
-    ! procedure :: backward
+    procedure :: backward
     procedure :: forward
     procedure :: split_heads
     procedure :: create_attention_matrix
@@ -49,15 +52,15 @@ end function multihead_attention_layer_cons
 
   interface
 
-    pure module subroutine backward(self, input, gradient)
+    module subroutine backward(self, input, gradient)
       !! Apply the backward gradient descent pass.
       !! Only weight and bias gradients are updated in this subroutine,
       !! while the weights and biases themselves are untouched.
       class(multihead_attention_layer), intent(in out) :: self
         !! Dense layer instance
-      real, intent(in) :: input(:)
+      real, intent(in) :: input(:, :, :)
         !! Input from the previous layer
-      real, intent(in) :: gradient(:)
+      real, intent(in) :: gradient(:, :, :)
         !! Gradient from the next layer
     end subroutine backward
 
@@ -109,6 +112,20 @@ module function multihead_attention_layer_cons(&
     res % softmax_func = softmax()
   end function multihead_attention_layer_cons
 
+  module subroutine backward(self, input, gradient)
+    class(multihead_attention_layer), intent(in out) :: self
+    real, intent(in) :: input(:, :, :)
+    real, intent(in) :: gradient(:, :, :)
+
+    call self % output_layer % backward(input, gradient)
+
+    ! FIXME: calculate gradient for softmax
+
+    call self % value_layer % backward(self % v_input, self % output_layer % gradient)
+    call self % key_layer % backward(self % k_input, self % output_layer % gradient)
+    call self % query_layer % backward(self % q_input, self % output_layer % gradient)
+  end subroutine backward
+
   module subroutine forward(self, query, key, value)
     class(multihead_attention_layer), intent(in out) :: self
     real, intent(in) :: query(:, :, :), key(:, :, :), value(:, :, :)
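
Note on the FIXME above: the proof-of-concept backward feeds output_layer % gradient straight into the value, key, and query layers, skipping the gradient of the softmax and of the scaled dot-product. The snippet below is a minimal, self-contained sketch of the missing softmax piece, a Jacobian-vector product for a rank-1 softmax; it is illustrative only and does not use the nf softmax type or the layer's actual arrays.

program softmax_gradient_sketch
  ! Standalone sketch: gradient of softmax via its Jacobian-vector product.
  ! Not part of the commit; names and shapes are illustrative.
  implicit none
  real :: scores(4), upstream(4), s(4), d_scores(4)

  scores = [1.0, 2.0, 0.5, -1.0]
  upstream = [0.1, -0.2, 0.3, 0.05]

  ! Forward softmax, stabilized by subtracting the maximum
  s = exp(scores - maxval(scores))
  s = s / sum(s)

  ! Backward: d_scores_i = s_i * (upstream_i - sum_j upstream_j * s_j)
  d_scores = s * (upstream - sum(upstream * s))

  print *, 'softmax:  ', s
  print *, 'd_scores: ', d_scores
end program softmax_gradient_sketch

In a full attention backward, this operation would be applied row-wise to each head's attention matrix before propagating the result into the query and key projections.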
@@ -119,6 +136,10 @@ module subroutine forward(self, query, key, value)
     real :: attention_matrix(self % n_heads, self % sequence_length, self % sequence_length, self % batch_size)
     real :: dot_product_attention(self % n_heads, self % sequence_length, self % head_size, self % batch_size)
 
+    self % q_input = query
+    self % k_input = key
+    self % v_input = value
+
     call self % query_layer % forward(query)
     call self % key_layer % forward(key)
     call self % value_layer % forward(value)
@@ -237,5 +258,9 @@ module subroutine init(self, input_shape)
       self % n_heads, self % sequence_length, self % head_size, self % batch_size&
     ))
     allocate(self % output(self % sequence_length, self % model_dimension, self % batch_size))
+
+    allocate(self % q_input(self % sequence_length, self % model_dimension, self % batch_size))
+    allocate(self % k_input(self % sequence_length, self % model_dimension, self % batch_size))
+    allocate(self % v_input(self % sequence_length, self % model_dimension, self % batch_size))
   end subroutine init
 end module nf_multihead_attention_layer

test/test_multihead_attention_layer.f90

Lines changed: 20 additions & 0 deletions
@@ -19,6 +19,7 @@ program test_multihead_attention_layer
   call test_multihead_attention_combine_heads(attention, attention % sdpa, ok)
   call test_multihead_attention_forward(attention, ok)
   call test_multihead_attention_forward_reallife_shape(ok)
+  call test_multihead_attention_backward(attention, ok)
 
 contains
   subroutine test_multihead_attention_split_heads(attention, input, ok, output)
@@ -183,4 +184,23 @@ subroutine test_multihead_attention_forward_reallife_shape(ok)
       write(stderr, '(a)') 'forward returned incorrect shape.. failed'
     end if
   end subroutine test_multihead_attention_forward_reallife_shape
+
+  subroutine test_multihead_attention_backward(attention, ok)
+    type(multihead_attention_layer), intent(in out) :: attention
+    logical, intent(in out) :: ok
+    real :: input(3, 4, 1) = reshape([0.0, 10.1, 0.2, 10.3, 0.4, 10.5, 0.6, 10.7, 10.8, 0.9, 0.11, 0.12], [3, 4, 1])
+    real :: gradient(3, 4, 1) = reshape(&
+        [.1, .1, .1, 3., 3., 3., 2., .1, 2., 3., .1, 3., 2., 2., .1, 3., 3., 3.], [3, 4, 1]&
+    )
+    real :: expected_shape(3) = [3, 4, 1]
+    real :: output_shape(3)
+
+    call attention % backward(input, gradient)
+
+    output_shape = shape(attention % output_layer % gradient)
+    if (.not. all(output_shape.eq.expected_shape)) then
+      ok = .false.
+      write(stderr, '(a)') 'backward returned incorrect shape.. failed'
+    end if
+  end subroutine test_multihead_attention_backward
 end program test_multihead_attention_layer
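
The new test only checks that output_layer % gradient comes back with the expected shape. Once the backward pass is mathematically complete, a finite-difference check is a common way to also verify the gradient values. The sketch below shows the idea on a standalone scalar function, f(x) = sum(x**2); wiring it up to the attention layer would depend on its constructor and parameter accessors, so everything here is illustrative rather than part of the test suite.

program finite_difference_sketch
  ! Standalone sketch of a central-difference gradient check.
  ! Illustrative only; not tied to the multihead_attention_layer API.
  implicit none
  real, parameter :: eps = 1.0e-2
  real :: x(3), analytic(3), numeric(3), xp(3), xm(3)
  integer :: i

  x = [0.5, -1.0, 2.0]
  analytic = 2.0 * x   ! analytic gradient of f(x) = sum(x**2)

  do i = 1, size(x)
    xp = x; xp(i) = xp(i) + eps
    xm = x; xm(i) = xm(i) - eps
    numeric(i) = (f(xp) - f(xm)) / (2.0 * eps)
  end do

  print *, 'max abs error between analytic and numeric gradient:', &
    maxval(abs(numeric - analytic))

contains

  pure real function f(v)
    real, intent(in) :: v(:)
    f = sum(v**2)
  end function f

end program finite_difference_sketch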
