Changes from all commits
37 commits
ad748da
GEMM reference HIP implementation
matthiasdiener Dec 9, 2025
11e090b
blockwise amax
matthiasdiener Dec 11, 2025
9006224
Merge branch 'dev' into compute-ref-offload
matthiasdiener Dec 18, 2025
3ecea7f
Change to use Tensor arguments, combine mxfp8/non-mxfp8 paths
matthiasdiener Jan 13, 2026
cafee59
Merge remote-tracking branch 'origin/dev' into compute-ref-offload
matthiasdiener Jan 14, 2026
86fbbac
skip on SwizzleScale limitation on gfx950
matthiasdiener Jan 14, 2026
54de3db
Revert "skip on SwizzleScale limitation on gfx950"
matthiasdiener Jan 14, 2026
311ddfe
MXFP8 fix
matthiasdiener Jan 14, 2026
306e432
Merge remote-tracking branch 'origin/dev' into compute-ref-offload
matthiasdiener Jan 15, 2026
445e64f
correct scale_inv packing and exp2(biased−127) conversion
matthiasdiener Jan 15, 2026
462945f
cleanups
matthiasdiener Jan 15, 2026
e32fb3d
Merge branch 'dev' into compute-ref-offload
matthiasdiener Jan 19, 2026
7bf8adb
Merge remote-tracking branch 'origin/dev' into compute-ref-offload
matthiasdiener Jan 22, 2026
e11e400
use Tensor class for more device objects
matthiasdiener Jan 22, 2026
325ece6
Pass D Tensor into run_reference and move RefD allocation into Perfor…
matthiasdiener Jan 23, 2026
fc64b8c
[WIP] proof-of-concept: grouped GEMM with ck_tile
matthiasdiener Jan 26, 2026
134b350
Merge branch 'dev' into ck-grouped-gemm
matthiasdiener Jan 28, 2026
9091e6c
restructure and enable tests
matthiasdiener Jan 29, 2026
7435062
Merge remote-tracking branch 'origin/dev' into ck-grouped-gemm
matthiasdiener Jan 29, 2026
a00a1c8
Merge remote-tracking branch 'origin/dev' into ck-grouped-gemm
matthiasdiener Jan 30, 2026
4e9ead9
grid improvements
matthiasdiener Jan 30, 2026
259645c
restructure
matthiasdiener Feb 3, 2026
9986bd4
reduce code duplication & simplify
matthiasdiener Feb 4, 2026
355ec2f
make the code more similar to nv, check empty gelu/bias
matthiasdiener Feb 4, 2026
df5e3ea
Merge branch 'dev' into ck-grouped-gemm
matthiasdiener Feb 4, 2026
a42f7ca
further simplify & make closer to nv
matthiasdiener Feb 4, 2026
fac7c11
add ck_tile reference
matthiasdiener Feb 4, 2026
71b97e0
rename in error messages
matthiasdiener Feb 4, 2026
dd3ed2f
allow flattened higher-D tensors
matthiasdiener Feb 4, 2026
7b0413e
Merge remote-tracking branch 'origin/dev' into ck-grouped-gemm
matthiasdiener Feb 5, 2026
ebc005f
relax tolerance on gfx942
matthiasdiener Feb 5, 2026
c0bf502
enable more tests
matthiasdiener Feb 5, 2026
0b16287
return early when num_gemms<=0
matthiasdiener Feb 5, 2026
58b34e7
simplify normalization
matthiasdiener Feb 5, 2026
74f229a
Merge remote-tracking branch 'origin/dev' into ck-grouped-gemm
matthiasdiener Feb 10, 2026
e28c801
run hipblaslt for num_gemms==1
matthiasdiener Feb 11, 2026
6151b96
Merge remote-tracking branch 'origin/dev' into ck-grouped-gemm
matthiasdiener Feb 12, 2026
90 changes: 90 additions & 0 deletions gmm2.py
@@ -0,0 +1,90 @@
import os
import time
import torch
import transformer_engine.pytorch as te

torch.manual_seed(0)

os.environ["NVTE_USE_CUTLASS_GROUPED_GEMM"] = "1"
os.environ["NVTE_CUTLASS_GROUPED_GEMM_WARN_FALLBACK"] = "1"

device = "cuda"
dtype = torch.bfloat16

E = 4
K = 1024
N = 2048
m_splits = [128, 64, 0, 256]
M_total = sum(m_splits)

x = torch.randn(M_total, K, device=device, dtype=dtype)

# Timing helper
def bench_cuda(fn, warmup=20, iters=100):
# Warmup
for _ in range(warmup):
fn()
torch.cuda.synchronize()

# Timed
start = time.time()
for _ in range(iters):
fn()
torch.cuda.synchronize()
end = time.time()

avg_ms = (end - start) * 1000.0 / iters
return avg_ms

# TE GroupedLinear
glinear = te.GroupedLinear(E, K, N, bias=False).to(device=device, dtype=dtype)

def te_run():
return glinear(x, m_splits=m_splits)

te_ms = bench_cuda(te_run, warmup=20, iters=100)

# Grab weights for reference path
Ws = [getattr(glinear, f"weight{e}") for e in range(E)] # each [N, K]
W = torch.stack(Ws, dim=0) # [E, N, K]
assert W.shape == (E, N, K), f"Unexpected weight shape: {W.shape}"

# Torch reference (group loop)
offsets = []
off = 0
for m in m_splits:
offsets.append(off)
off += m

y_ref_buf = torch.empty((M_total, N), device=device, dtype=dtype)

def torch_run():
# Fill the preallocated buffer
for e, m in enumerate(m_splits):
if m == 0:
continue
o = offsets[e]
y_ref_buf[o:o+m].copy_(x[o:o+m] @ W[e].transpose(0, 1))
return y_ref_buf

torch_ms = bench_cuda(torch_run, warmup=20, iters=100)

# Compare outputs
y_te = te_run()
y_ref = torch_run().clone()

diff = (y_te.float() - y_ref.float())
max_abs = diff.abs().max().item()
rel = (diff.abs() / (y_ref.float().abs() + 1e-6)).max().item()

print(f"Errors:")
print(f" {y_te.shape=}, {y_ref.shape=}")
print(" max_abs_err:", max_abs)
print(" max_rel_err:", rel)

torch.testing.assert_close(y_te.float(), y_ref.float(), rtol=3e-2, atol=3e-2)

print(f"\nTiming:")
print(f" TE avg: {te_ms:.3f} ms")
print(f" Torch avg: {torch_ms:.3f} ms")
print(f" Speedup: {torch_ms/te_ms:.2f}x (Torch / TE)")
21 changes: 13 additions & 8 deletions tests/pytorch/test_numerics.py
@@ -28,7 +28,7 @@
     is_bf16_compatible,
 )
 if IS_HIP_EXTENSION:
-    from transformer_engine.pytorch.utils import is_mi200, is_mi308
+    from transformer_engine.pytorch.utils import is_mi200, is_mi308, is_mi300_class
 
 from transformer_engine.pytorch import (
     DotProductAttention,
@@ -148,7 +148,7 @@ def rocm_attn_backend() -> tuple[bool, bool, bool]:
 
 use_cutlass_grouped_gemm = [False]
 # Only enable cutlass grouped gemm on Hopper
-if torch.cuda.get_device_capability() == (9, 0):
+if torch.cuda.get_device_capability() == (9, 0) or IS_HIP_EXTENSION:
     use_cutlass_grouped_gemm.append(True)
 
 
@@ -1386,7 +1386,7 @@ def test_linear_accuracy_delay_wgrad_compute(dtype, bs, model, bias, fuse_wgrad_
 
     if IS_HIP_EXTENSION:
         if dtype not in (torch.float32,) and fuse_wgrad_accumulation and bias:
-            pytest.skip(f"Rocm does not support fused wgrad accumulation for {dtype}.")
+            pytest.skip(f"ROCm does not support fused wgrad accumulation for {dtype}.")
 
     te_linear_ref = Linear(
         config.hidden_size,
@@ -1678,7 +1678,7 @@ def test_layernorm_linear_accuracy_delay_wgrad_compute(
 ):
     if IS_HIP_EXTENSION:
         if dtype not in (torch.float32,) and fuse_wgrad_accumulation and bias:
-            pytest.skip(f"Rocm does not support fused wgrad accumulation for {dtype}.")
+            pytest.skip(f"ROCm does not support fused wgrad accumulation for {dtype}.")
     config = model_configs[model]
 
     ln_linear_ref = LayerNormLinear(
@@ -1892,7 +1892,7 @@ def test_layernorm_mlp_accuracy_delay_wgrad_compute(
 
     if IS_HIP_EXTENSION:
         if dtype not in (torch.float32,) and fuse_wgrad_accumulation and bias:
-            pytest.skip(f"Rocm does not support fused wgrad accumulation for {dtype}.")
+            pytest.skip(f"ROCm does not support fused wgrad accumulation for {dtype}.")
 
     ln_mlp = LayerNormMLP(
         hidden_size=config.hidden_size,
@@ -2042,7 +2042,7 @@ def test_grouped_linear_accuracy(
 
     if IS_HIP_EXTENSION:
         if dtype not in (torch.float32,) and fuse_wgrad_accumulation and not fp8:
-            pytest.skip(f"Rocm does not support fused wgrad accumulation for {dtype}.")
+            pytest.skip(f"ROCm does not support fused wgrad accumulation for {dtype}.")
     if fp8 and fp8_model_params and NVTE_TEST_NVINSPECT_ENABLED:
         pytest.skip("FP8 parameters are not supported in debug mode.")
 
@@ -2121,6 +2121,8 @@ def test_grouped_linear_accuracy(
     atol, rtol = 0, 0
     if use_cutlass:
         atol, rtol = 1e-3, 1e-3
+    if IS_HIP_EXTENSION and is_mi300_class():
+        atol, rtol = 3e-2, 3e-2
     if use_triton:
         atol, rtol = get_tolerances(dtype)
     if dtype == torch.float32:
@@ -2131,7 +2133,7 @@
 
 
 @pytest.mark.skipif(
-    torch.cuda.get_device_capability() != (9, 0),
+    torch.cuda.get_device_capability() != (9, 0) and not IS_HIP_EXTENSION,
     reason="Only enable CUTLASS grouped gemm on Hopper",
 )
 @pytest.mark.parametrize("dtype", param_types, ids=str)
@@ -2936,7 +2938,10 @@ def test_grouped_gemm(shape, dtype, layout, accumulate, use_cutlass):
         # cublas implementation should be bit-wise match
         torch.testing.assert_close(o, o_ref, rtol=0, atol=0)
     else:
-        torch.testing.assert_close(o, o_ref, rtol=1.5e-2, atol=1.5e-2)
+        if IS_HIP_EXTENSION and is_mi300_class():
+            torch.testing.assert_close(o, o_ref, rtol=2.0e-2, atol=3.0e-2)
+        else:
+            torch.testing.assert_close(o, o_ref, rtol=1.5e-2, atol=1.5e-2)
 
     if use_cutlass:
         os.environ.pop("NVTE_USE_CUTLASS_GROUPED_GEMM", None)
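The is_mi300_class() gate used in the hunks above comes from transformer_engine.pytorch.utils; its implementation is not shown in this diff. The snippet below is only a hedged sketch of the general pattern of architecture-gated tolerances — the gcnArchName-based check and the _tolerances_for_device helper are illustrative assumptions, not TE's actual code.

# Hypothetical sketch (not code from this PR): pick looser tolerances on
# MI300-class GPUs. Assumes a ROCm build of PyTorch, where device properties
# expose gcnArchName (e.g. "gfx942"); the attribute is absent on CUDA builds.
import torch

def _is_mi300_like() -> bool:
    arch = getattr(torch.cuda.get_device_properties(0), "gcnArchName", "")
    return arch.startswith("gfx94")  # gfx940/941/942 are MI300-class

def _tolerances_for_device(default=(1.5e-2, 1.5e-2), mi300=(2.0e-2, 3.0e-2)):
    rtol, atol = mi300 if _is_mi300_like() else default
    return rtol, atol

rtol, atol = _tolerances_for_device()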
4 changes: 4 additions & 0 deletions transformer_engine/common/CMakeLists.txt
@@ -203,6 +203,7 @@ else()
         fused_attn_rocm/fused_attn_ck.cpp
         fused_attn_rocm/utils.cpp
         gemm/rocm_gemm.cu
+        gemm/ck_grouped_gemm.cpp
         amd_detail/system.cpp)
 
 # process source code files
@@ -251,6 +252,9 @@ if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0)
   else()
     message(FATAL_ERROR "cutlass gemm/cutlass_grouped_gemm.cu kernel required sm 90a")
   endif()
+else()
+  set(CK_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../3rdparty/aiter/3rdparty/composable_kernel)
+  target_include_directories(transformer_engine PRIVATE ${CK_ROOT}/include)
 endif() #USE_CUDA
 
 # Configure dependencies