
Commit 5aad251

add geglu backward (#1069)
1 parent a0ba4d2 commit 5aad251

3 files changed: +210, −6 lines

examples/geglu.py

Lines changed: 87 additions & 6 deletions
@@ -36,6 +36,7 @@
 
 if TYPE_CHECKING:
     from collections.abc import Callable
+    from typing import Any
 
 
 # %%
@@ -104,6 +105,71 @@ def geglu(a: Tensor, b: Tensor) -> Tensor:
     return out
 
 
+@helion.kernel()
+def geglu_bwd(grad_out: Tensor, a: Tensor, b: Tensor) -> tuple[Tensor, Tensor]:
+    grad_a = torch.empty_like(a)
+    grad_b = torch.empty_like(b)
+
+    grad_out_flat = grad_out.view(-1)
+    a_flat = a.view(-1)
+    b_flat = b.view(-1)
+    grad_a_flat = grad_a.view(-1)
+    grad_b_flat = grad_b.view(-1)
+
+    for tile_idx in hl.tile(a.numel()):
+        a_vals = a_flat[tile_idx].to(torch.float32)
+        b_vals = b_flat[tile_idx].to(torch.float32)
+        grad_out_vals = grad_out_flat[tile_idx].to(torch.float32)
+
+        sqrt_2_over_pi = 0.7978845608028654
+
+        a_cubed = a_vals * a_vals * a_vals
+        tanh_arg = sqrt_2_over_pi * (a_vals + 0.044715 * a_cubed)
+        tanh_result = torch.tanh(tanh_arg)
+        gelu_a = 0.5 * a_vals * (1.0 + tanh_result)
+
+        grad_b_vals = grad_out_vals * gelu_a
+        grad_b_flat[tile_idx] = grad_b_vals.to(b.dtype)
+
+        dz_da = sqrt_2_over_pi * (1.0 + 0.134145 * a_vals * a_vals)
+        sech_sq = 1.0 - tanh_result * tanh_result
+
+        dgelu_da = 0.5 * (1.0 + tanh_result) + 0.5 * a_vals * sech_sq * dz_da
+
+        grad_a_vals = grad_out_vals * b_vals * dgelu_da
+        grad_a_flat[tile_idx] = grad_a_vals.to(a.dtype)
+
+    return grad_a, grad_b
+
+
+class GEGLUFunction(torch.autograd.Function):
+    @staticmethod
+    def forward(
+        ctx: Any,  # noqa: ANN401
+        a: Tensor,
+        b: Tensor,
+    ) -> Tensor:
+        """Forward pass for GEGLU."""
+        out = geglu(a, b)
+        ctx.save_for_backward(a, b)
+        return out
+
+    @staticmethod
+    def backward(  # type: ignore[override]
+        ctx: Any,  # noqa: ANN401
+        grad_out: Tensor,
+    ) -> tuple[Tensor, Tensor]:
+        """Backward pass for GEGLU."""
+        a, b = ctx.saved_tensors
+        grad_a, grad_b = geglu_bwd(grad_out, a, b)
+        return grad_a, grad_b
+
+
+def geglu_autograd(a: Tensor, b: Tensor) -> Tensor:
+    """GEGLU with forward + backward support."""
+    return GEGLUFunction.apply(a, b)  # type: ignore[no-any-return]
+
+
 # %%
 # GEGLU MLP Module (matches liger_kernel structure)
 # -------------------------------------------------
@@ -167,9 +233,6 @@ def check_geglu_kernel(shape: tuple[int, ...]) -> None:
     Args:
         shape: Shape of the input tensors to test.
     """
-    # Create test tensors
-    a = torch.randn(shape, device=DEVICE, dtype=torch.float16)
-    b = torch.randn(shape, device=DEVICE, dtype=torch.float16)
 
     def baseline_geglu(a: Tensor, b: Tensor) -> Tensor:
         """
@@ -178,8 +241,26 @@ def baseline_geglu(a: Tensor, b: Tensor) -> Tensor:
         """
         return nn.functional.gelu(a, approximate="tanh").to(b.dtype) * b
 
+    print("\n=== Forward Pass Test ===")
+    a = torch.randn(shape, device=DEVICE, dtype=torch.float16)
+    b = torch.randn(shape, device=DEVICE, dtype=torch.float16)
     run_example(geglu, baseline_geglu, (a, b))
 
+    # Test forward + backward pass
+    print("\n\n=== Forward + Backward Pass Test ===")
+    a_grad = torch.randn(shape, device=DEVICE, dtype=torch.float16, requires_grad=True)
+    b_grad = torch.randn(shape, device=DEVICE, dtype=torch.float16, requires_grad=True)
+    run_example(
+        geglu_autograd,
+        baseline_geglu,
+        (a_grad, b_grad),
+        kernel_name="helion_autograd",
+        baseline_name="torch",
+        rtol=1e-2,
+        atol=1e-1,
+        bwd=True,
+    )
+
 
 class BaselineMLP(nn.Module):
     def __init__(self, config: Config) -> None:
@@ -303,11 +384,11 @@ def main() -> None:
     kernel_test_shapes = [(8, 2048, 4096), (8, 4096, 8192)]
 
     for shape in kernel_test_shapes:
-        print(f"Testing GEGLU kernel shape: {shape}")
+        print(f"\nTesting GEGLU kernel shape: {shape}")
         check_geglu_kernel(shape)
         print(f"✓ GEGLU kernel shape {shape} passed")
 
-    print("\nTesting GEGLU MLP...")
+    print("\n\nTesting GEGLU MLP...")
 
     # Test GEGLU MLP with transformer-typical sizes
     mlp_test_configs = [
@@ -317,7 +398,7 @@ def main() -> None:
 
     for batch_size, seq_len, hidden_size, intermediate_size in mlp_test_configs:
         print(
-            f"Testing GEGLU MLP: B={batch_size}, T={seq_len}, H={hidden_size}, I={intermediate_size}"
+            f"\nTesting GEGLU MLP: B={batch_size}, T={seq_len}, H={hidden_size}, I={intermediate_size}"
        )
         check_geglu_mlp(batch_size, seq_len, hidden_size, intermediate_size)
         print("✓ GEGLU MLP config passed")
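For readers who want to check the closed-form derivative used in geglu_bwd above: the kernel differentiates the tanh-approximation GELU, so dGELU/da = 0.5·(1 + tanh(z)) + 0.5·a·(1 − tanh²(z))·dz/da, with z = √(2/π)·(a + 0.044715·a³) and dz/da = √(2/π)·(1 + 0.134145·a²), and the per-tile gradients are grad_a = grad_out · b · dGELU/da and grad_b = grad_out · GELU(a). A minimal sketch (not part of this commit; the tensor size and names are illustrative) that verifies the closed form against PyTorch autograd:

import torch

# Closed-form dGELU/da for the tanh approximation, mirroring geglu_bwd above.
a = torch.randn(4096, dtype=torch.float64, requires_grad=True)
sqrt_2_over_pi = 0.7978845608028654
tanh_arg = sqrt_2_over_pi * (a + 0.044715 * a**3)
t = torch.tanh(tanh_arg)
dz_da = sqrt_2_over_pi * (1.0 + 0.134145 * a * a)  # 0.134145 == 3 * 0.044715
dgelu_da = 0.5 * (1.0 + t) + 0.5 * a * (1.0 - t * t) * dz_da

# Reference gradient from autograd on the same tanh-approximation GELU.
(ref,) = torch.autograd.grad(torch.nn.functional.gelu(a, approximate="tanh").sum(), a)
torch.testing.assert_close(dgelu_da, ref)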

test/test_examples.expected

Lines changed: 99 additions & 0 deletions
@@ -1756,6 +1756,105 @@ def geglu(a: Tensor, b: Tensor, *, _launcher=_default_launcher):
     # src[geglu.py:N]: return out
     return out
 
+--- assertExpectedJournal(TestExamples.test_geglu_bwd)
+from __future__ import annotations
+
+import torch
+import triton
+import triton.language as tl
+from helion.runtime import default_launcher as _default_launcher
+
+@triton.jit
+def _helion_geglu_bwd(a_flat, b_flat, grad_out_flat, grad_b_flat, grad_a_flat, _BLOCK_SIZE_0: tl.constexpr):
+    # src[geglu.py:N]: for tile_idx in hl.tile(a.numel()):
+    pid_0 = tl.program_id(0)
+    offset_0 = pid_0 * _BLOCK_SIZE_0
+    indices_0 = (offset_0 + tl.arange(0, _BLOCK_SIZE_0)).to(tl.int32)
+    # src[geglu.py:N]: a_vals = a_flat[tile_idx].to(torch.float32)
+    load = tl.load(a_flat + indices_0 * 1, None)
+    v_0 = tl.cast(load, tl.float32)
+    # src[geglu.py:N]: b_vals = b_flat[tile_idx].to(torch.float32)
+    load_1 = tl.load(b_flat + indices_0 * 1, None)
+    v_1 = tl.cast(load_1, tl.float32)
+    # src[geglu.py:N]: grad_out_vals = grad_out_flat[tile_idx].to(torch.float32)
+    load_2 = tl.load(grad_out_flat + indices_0 * 1, None)
+    v_2 = tl.cast(load_2, tl.float32)
+    # src[geglu.py:N]: a_cubed = a_vals * a_vals * a_vals
+    v_3 = v_0 * v_0
+    v_4 = v_3 * v_0
+    # src[geglu.py:N]: tanh_arg = sqrt_2_over_pi * (a_vals + 0.044715 * a_cubed)
+    v_5 = 0.044715
+    v_6 = v_4 * v_5
+    v_7 = v_0 + v_6
+    v_8 = 0.7978845608028654
+    v_9 = v_7 * v_8
+    # src[geglu.py:N]: tanh_result = torch.tanh(tanh_arg)
+    v_10 = libdevice.tanh(v_9)
+    # src[geglu.py:N]: gelu_a = 0.5 * a_vals * (1.0 + tanh_result)
+    v_11 = 0.5
+    v_12 = v_0 * v_11
+    v_13 = 1.0
+    v_14 = v_10 + v_13
+    v_15 = v_12 * v_14
+    # src[geglu.py:N]: grad_b_vals = grad_out_vals * gelu_a
+    v_16 = v_2 * v_15
+    # src[geglu.py:N]: grad_b_flat[tile_idx] = grad_b_vals.to(b.dtype)
+    v_17 = tl.cast(v_16, tl.bfloat16)
+    tl.store(grad_b_flat + indices_0 * 1, v_17, None)
+    # src[geglu.py:N]: dz_da = sqrt_2_over_pi * (1.0 + 0.134145 * a_vals * a_vals)
+    v_18 = 0.134145
+    v_19 = v_0 * v_18
+    v_20 = v_19 * v_0
+    v_21 = 1.0
+    v_22 = v_20 + v_21
+    v_23 = 0.7978845608028654
+    v_24 = v_22 * v_23
+    # src[geglu.py:N]: sech_sq = 1.0 - tanh_result * tanh_result
+    v_25 = v_10 * v_10
+    v_26 = 1.0
+    v_27 = v_26 - v_25
+    # src[geglu.py:N]: dgelu_da = 0.5 * (1.0 + tanh_result) + 0.5 * a_vals * sech_sq * dz_da
+    v_28 = 1.0
+    v_29 = v_10 + v_28
+    v_30 = 0.5
+    v_31 = v_29 * v_30
+    v_32 = 0.5
+    v_33 = v_0 * v_32
+    v_34 = v_33 * v_27
+    v_35 = v_34 * v_24
+    v_36 = v_31 + v_35
+    # src[geglu.py:N]: grad_a_vals = grad_out_vals * b_vals * dgelu_da
+    v_37 = v_2 * v_1
+    v_38 = v_37 * v_36
+    # src[geglu.py:N]: grad_a_flat[tile_idx] = grad_a_vals.to(a.dtype)
+    v_39 = tl.cast(v_38, tl.bfloat16)
+    tl.store(grad_a_flat + indices_0 * 1, v_39, None)
+
+def geglu_bwd(grad_out: Tensor, a: Tensor, b: Tensor, *, _launcher=_default_launcher):
+    # src[geglu.py:N]: grad_a = torch.empty_like(a)
+    grad_a = torch.empty_like(a)
+    # src[geglu.py:N]: grad_b = torch.empty_like(b)
+    grad_b = torch.empty_like(b)
+    # src[geglu.py:N]: grad_out_flat = grad_out.view(-1)
+    grad_out_flat = grad_out.view(-1)
+    # src[geglu.py:N]: a_flat = a.view(-1)
+    a_flat = a.view(-1)
+    # src[geglu.py:N]: b_flat = b.view(-1)
+    b_flat = b.view(-1)
+    # src[geglu.py:N]: grad_a_flat = grad_a.view(-1)
+    grad_a_flat = grad_a.view(-1)
+    # src[geglu.py:N]: grad_b_flat = grad_b.view(-1)
+    grad_b_flat = grad_b.view(-1)
+    # src[geglu.py:N]: for tile_idx in hl.tile(a.numel()):
+    _BLOCK_SIZE_0 = 16
+    # src[geglu.py:N]: for tile_idx in hl.tile(a.numel()):
+    # src[geglu.py:N]: a_vals = a_flat[tile_idx].to(torch.float32)
+    # src[geglu.py:N]: b_vals = b_flat[tile_idx].to(torch.float32)
+    # src[geglu.py:N-N]: ...
+    _launcher(_helion_geglu_bwd, (triton.cdiv(1024, _BLOCK_SIZE_0),), a_flat, b_flat, grad_out_flat, grad_b_flat, grad_a_flat, _BLOCK_SIZE_0, num_warps=4, num_stages=3)
+    # src[geglu.py:N]: return grad_a, grad_b
+    return (grad_a, grad_b)
+
 --- assertExpectedJournal(TestExamples.test_grouped_gemm_jagged)
 from __future__ import annotations
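One detail worth noting in the generated launcher above: the test input has 1024 elements and the test config pins _BLOCK_SIZE_0 to 16, so the 1-D launch grid is triton.cdiv(1024, 16) = 64 programs. A trivial sketch of that arithmetic (illustrative only, not part of the expected output):

import math

numel = 1024       # flattened element count of the test tensors
block_size = 16    # _BLOCK_SIZE_0 fixed by block_sizes=[16] in the test
grid = math.ceil(numel / block_size)  # what triton.cdiv(1024, _BLOCK_SIZE_0) computes
assert grid == 64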

test/test_examples.py

Lines changed: 24 additions & 0 deletions
@@ -1227,6 +1227,30 @@ def test_geglu(self):
             )
         )
 
+    def test_geglu_bwd(self):
+        x1, x2 = [
+            torch.randn(1024, device=DEVICE, dtype=torch.bfloat16, requires_grad=True)
+            for _ in range(2)
+        ]
+
+        out = torch.nn.functional.gelu(x1, approximate="tanh") * x2
+        grad_out = torch.randn_like(out)
+        out.backward(grad_out)
+
+        args = (grad_out, x1, x2)
+
+        self.assertExpectedJournal(
+            check_example(
+                "geglu",
+                args,
+                (x1.grad, x2.grad),
+                fn_name="geglu_bwd",
+                block_sizes=[16],
+                num_warps=4,
+                num_stages=3,
+            )
+        )
+
     def test_swiglu(self):
         args = (
             torch.randn([1024, 1024], device=DEVICE, dtype=torch.float16),
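For context, the autograd path that the new test exercises through check_example can also be driven directly, roughly as below. This is an illustrative sketch, not code from the commit; the import path and device string are assumptions.

import torch

from examples.geglu import geglu_autograd  # assumed import path

a = torch.randn(8, 2048, 4096, device="cuda", dtype=torch.float16, requires_grad=True)
b = torch.randn(8, 2048, 4096, device="cuda", dtype=torch.float16, requires_grad=True)

out = geglu_autograd(a, b)  # forward: Helion geglu kernel
out.sum().backward()        # backward: Helion geglu_bwd kernel
print(a.grad.shape, b.grad.shape)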
