@@ -14,7 +14,7 @@ program test_multihead_attention_layer
 
   attention = multihead_attention_layer(sequence_length=3, model_dimension=4, n_heads=2)
   call attention % init([0])
-  !
+
   call test_multihead_attention_split_heads(attention, sample_input, ok, split_heads_output)
   call test_multihead_attention_create_attention_matrix(attention, split_heads_output, ok)
   call test_multihead_attention_normalization(attention, ok)
@@ -23,7 +23,7 @@ program test_multihead_attention_layer
   call test_multihead_attention_forward(attention, ok)
   call test_multihead_attention_backward(attention, ok)
   call test_multihead_attention_update_gradients(attention, ok)
-  ! call test_multihead_attention_forward_reallife_shape(ok)
+  call test_multihead_attention_forward_reallife_shape(ok)
 
 contains
   subroutine test_multihead_attention_split_heads(attention, input, ok, output)
@@ -156,7 +156,7 @@ subroutine test_multihead_attention_forward(attention, ok)
       0.447508544, 0.464612424, 0.464721352, 0.473546445, 0.512576580, 0.513393998 &
     ]
 
-    call attention % forward(input, input, input)
+    call attention % common_forward(input, input, input)
 
     output_shape = shape(attention % output)
     if (.not. all(output_shape .eq. expected_shape)) then
@@ -196,7 +196,7 @@ subroutine test_multihead_attention_forward_reallife_shape(ok)
     attention = multihead_attention_layer(sequence_length=148, model_dimension=512, n_heads=8)
     call attention % init([0])
 
-    call attention % forward(input, input, input)
+    call attention % common_forward(input, input, input)
 
     output_shape = shape(attention % output)
     if (.not. all(output_shape .eq. expected_shape)) then
@@ -221,7 +221,7 @@ subroutine test_multihead_attention_backward(attention, ok)
     real :: output_flat(12)
     real :: output_shape(2)
 
-    call attention % backward(input, gradient)
+    call attention % common_backward(input, gradient)
 
     ! sample for Self Attention: sum of output gradients
     ! FIXME: remove reshapes when linear2d situation is resolved
@@ -271,7 +271,7 @@ subroutine test_multihead_attention_update_gradients(attention, ok)
     call optim % minimize(parameters, attention % get_gradients())
     call attention % set_params(parameters)
 
-    call attention % forward(&
+    call attention % common_forward(&
       reshape([0.0, 10.1, 0.2, 10.3, 0.4, 10.5, 0.6, 10.7, 10.8, 0.9, 0.11, 0.12], [3, 4]),&
       reshape([0.0, 10.1, 0.2, 10.3, 0.4, 10.5, 0.6, 10.7, 10.8, 0.9, 0.11, 0.12], [3, 4]),&
       reshape([0.0, 10.1, 0.2, 10.3, 0.4, 10.5, 0.6, 10.7, 10.8, 0.9, 0.11, 0.12], [3, 4])&
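
A minimal sketch of how the renamed entry points are exercised, based only on what this diff shows (constructor keywords, `init([0])`, and the `common_forward`/`common_backward` names); the module name in the use statement is an assumption, not confirmed by the diff:

! Minimal usage sketch: drive the renamed common_forward/common_backward
! type-bound procedures with a 3x4 self-attention input, as the tests above do.
program sketch_multihead_attention_rename
  use nf_multihead_attention_layer, only: multihead_attention_layer  ! assumed module name
  implicit none
  type(multihead_attention_layer) :: attention
  real :: input(3, 4), gradient(3, 4)

  input = reshape([0.0, 10.1, 0.2, 10.3, 0.4, 10.5, 0.6, 10.7, 10.8, 0.9, 0.11, 0.12], [3, 4])
  gradient = 1.0

  attention = multihead_attention_layer(sequence_length=3, model_dimension=4, n_heads=2)
  call attention % init([0])

  ! query, key and value all receive the same array for self-attention
  call attention % common_forward(input, input, input)
  call attention % common_backward(input, gradient)

  print *, shape(attention % output)  ! expected: 3 4
end program sketch_multihead_attention_rename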