diff --git a/gmm2.py b/gmm2.py
new file mode 100644
index 000000000..938d69247
--- /dev/null
+++ b/gmm2.py
@@ -0,0 +1,90 @@
+import os
+import time
+import torch
+import transformer_engine.pytorch as te
+
+torch.manual_seed(0)
+
+os.environ["NVTE_USE_CUTLASS_GROUPED_GEMM"] = "1"
+os.environ["NVTE_CUTLASS_GROUPED_GEMM_WARN_FALLBACK"] = "1"
+
+device = "cuda"
+dtype = torch.bfloat16
+
+E = 4
+K = 1024
+N = 2048
+m_splits = [128, 64, 0, 256]
+M_total = sum(m_splits)
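+# Note: one split is 0, so the empty-group path of the grouped GEMM is
+# exercised by this benchmark as well.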
+
+x = torch.randn(M_total, K, device=device, dtype=dtype)
+
+# Timing helper
+def bench_cuda(fn, warmup=20, iters=100):
+    # Warmup
+    for _ in range(warmup):
+        fn()
+    torch.cuda.synchronize()
+
+    # Timed
+    start = time.time()
+    for _ in range(iters):
+        fn()
+    torch.cuda.synchronize()
+    end = time.time()
+
+    avg_ms = (end - start) * 1000.0 / iters
+    return avg_ms
+
+# TE GroupedLinear
+glinear = te.GroupedLinear(E, K, N, bias=False).to(device=device, dtype=dtype)
+
+def te_run():
+    return glinear(x, m_splits=m_splits)
+
+te_ms = bench_cuda(te_run, warmup=20, iters=100)
+
+# Grab weights for reference path
+Ws = [getattr(glinear, f"weight{e}") for e in range(E)]  # each [N, K]
+W = torch.stack(Ws, dim=0)  # [E, N, K]
+assert W.shape == (E, N, K), f"Unexpected weight shape: {W.shape}"
+
+# Torch reference (group loop)
+offsets = []
+off = 0
+for m in m_splits:
+    offsets.append(off)
+    off += m
+
+y_ref_buf = torch.empty((M_total, N), device=device, dtype=dtype)
+
+def torch_run():
+    # Fill the preallocated buffer
+    for e, m in enumerate(m_splits):
+        if m == 0:
+            continue
+        o = offsets[e]
+        y_ref_buf[o:o+m].copy_(x[o:o+m] @ W[e].transpose(0, 1))
+    return y_ref_buf
+
+torch_ms = bench_cuda(torch_run, warmup=20, iters=100)
+
+# Compare outputs
+y_te = te_run()
+y_ref = torch_run().clone()
+
+diff = (y_te.float() - y_ref.float())
+max_abs = diff.abs().max().item()
+rel = (diff.abs() / (y_ref.float().abs() + 1e-6)).max().item()
+
+print(f"Errors:")
+print(f"  {y_te.shape=}, {y_ref.shape=}")
+print("  max_abs_err:", max_abs)
+print("  max_rel_err:", rel)
+
+torch.testing.assert_close(y_te.float(), y_ref.float(), rtol=3e-2, atol=3e-2)
+
+print(f"\nTiming:")
+print(f"  TE avg:    {te_ms:.3f} ms")
+print(f"  Torch avg: {torch_ms:.3f} ms")
+print(f"  Speedup:   {torch_ms/te_ms:.2f}x (Torch / TE)")
diff --git a/tests/pytorch/test_numerics.py b/tests/pytorch/test_numerics.py
index bc29d29e3..bd71e0a50 100644
--- a/tests/pytorch/test_numerics.py
+++ b/tests/pytorch/test_numerics.py
@@ -28,7 +28,7 @@
     is_bf16_compatible,
 )
 if IS_HIP_EXTENSION:
-    from transformer_engine.pytorch.utils import is_mi200, is_mi308
+    from transformer_engine.pytorch.utils import is_mi200, is_mi308, is_mi300_class
 
 from transformer_engine.pytorch import (
     DotProductAttention,
@@ -148,7 +148,7 @@ def rocm_attn_backend() -> tuple[bool, bool, bool]:
 
 use_cutlass_grouped_gemm = [False]
 # Only enable cutlass grouped gemm on Hopper
-if torch.cuda.get_device_capability() == (9, 0):
+if torch.cuda.get_device_capability() == (9, 0) or IS_HIP_EXTENSION:
     use_cutlass_grouped_gemm.append(True)
 
 
@@ -1386,7 +1386,7 @@ def test_linear_accuracy_delay_wgrad_compute(dtype, bs, model, bias, fuse_wgrad_
 
     if IS_HIP_EXTENSION:
         if dtype not in (torch.float32,) and fuse_wgrad_accumulation and bias:
-            pytest.skip(f"Rocm does not support fused wgrad accumulation for {dtype}.")
+            pytest.skip(f"ROCm does not support fused wgrad accumulation for {dtype}.")
 
     te_linear_ref = Linear(
         config.hidden_size,
@@ -1678,7 +1678,7 @@ def test_layernorm_linear_accuracy_delay_wgrad_compute(
 ):
     if IS_HIP_EXTENSION:
         if dtype not in (torch.float32,) and fuse_wgrad_accumulation and bias:
-            pytest.skip(f"Rocm does not support fused wgrad accumulation for {dtype}.")
+            pytest.skip(f"ROCm does not support fused wgrad accumulation for {dtype}.")
 
     config = model_configs[model]
     ln_linear_ref = LayerNormLinear(
@@ -1892,7 +1892,7 @@ def test_layernorm_mlp_accuracy_delay_wgrad_compute(
 
     if IS_HIP_EXTENSION:
         if dtype not in (torch.float32,) and fuse_wgrad_accumulation and bias:
-            pytest.skip(f"Rocm does not support fused wgrad accumulation for {dtype}.")
+            pytest.skip(f"ROCm does not support fused wgrad accumulation for {dtype}.")
 
     ln_mlp = LayerNormMLP(
         hidden_size=config.hidden_size,
@@ -2042,7 +2042,7 @@ def test_grouped_linear_accuracy(
 
     if IS_HIP_EXTENSION:
         if dtype not in (torch.float32,) and fuse_wgrad_accumulation and not fp8:
-            pytest.skip(f"Rocm does not support fused wgrad accumulation for {dtype}.")
+            pytest.skip(f"ROCm does not support fused wgrad accumulation for {dtype}.")
 
     if fp8 and fp8_model_params and NVTE_TEST_NVINSPECT_ENABLED:
         pytest.skip("FP8 parameters are not supported in debug mode.")
@@ -2121,6 +2121,8 @@ def test_grouped_linear_accuracy(
     atol, rtol = 0, 0
     if use_cutlass:
         atol, rtol = 1e-3, 1e-3
+        if IS_HIP_EXTENSION and is_mi300_class():
+            atol, rtol = 3e-2, 3e-2
     if use_triton:
         atol, rtol = get_tolerances(dtype)
         if dtype == torch.float32:
@@ -2131,7 +2133,7 @@ def test_grouped_linear_accuracy(
 
 
 @pytest.mark.skipif(
-    torch.cuda.get_device_capability() != (9, 0),
+    torch.cuda.get_device_capability() != (9, 0) and not IS_HIP_EXTENSION,
     reason="Only enable CUTLASS grouped gemm on Hopper",
 )
 @pytest.mark.parametrize("dtype", param_types, ids=str)
@@ -2936,7 +2938,10 @@ def test_grouped_gemm(shape, dtype, layout, accumulate, use_cutlass):
         # cublas implementation should be bit-wise match
         torch.testing.assert_close(o, o_ref, rtol=0, atol=0)
     else:
-        torch.testing.assert_close(o, o_ref, rtol=1.5e-2, atol=1.5e-2)
+        if IS_HIP_EXTENSION and is_mi300_class():
+            torch.testing.assert_close(o, o_ref, rtol=2.0e-2, atol=3.0e-2)
+        else:
+            torch.testing.assert_close(o, o_ref, rtol=1.5e-2, atol=1.5e-2)
 
     if use_cutlass:
         os.environ.pop("NVTE_USE_CUTLASS_GROUPED_GEMM", None)
diff --git a/transformer_engine/common/CMakeLists.txt b/transformer_engine/common/CMakeLists.txt
index cefec6d06..4a04a630f 100644
--- a/transformer_engine/common/CMakeLists.txt
+++ b/transformer_engine/common/CMakeLists.txt
@@ -203,6 +203,7 @@ else()
             fused_attn_rocm/fused_attn_ck.cpp
             fused_attn_rocm/utils.cpp
             gemm/rocm_gemm.cu
+            gemm/ck_grouped_gemm.cpp
             amd_detail/system.cpp)
 
     # process source code files
@@ -251,6 +252,9 @@ if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0)
   else()
     message(FATAL_ERROR "cutlass gemm/cutlass_grouped_gemm.cu kernel required sm 90a")
   endif()
+else()
+  set(CK_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../3rdparty/aiter/3rdparty/composable_kernel)
+  target_include_directories(transformer_engine PRIVATE ${CK_ROOT}/include)
 endif() #USE_CUDA
 
 # Configure dependencies
diff --git a/transformer_engine/common/gemm/ck_grouped_gemm.cpp b/transformer_engine/common/gemm/ck_grouped_gemm.cpp
new file mode 100644
index 000000000..4c3e0264a
--- /dev/null
+++ b/transformer_engine/common/gemm/ck_grouped_gemm.cpp
@@ -0,0 +1,273 @@
+/* Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved. */
+
+#include <transformer_engine/transformer_engine.h>
+
+#include <vector>
+#include "../common.h"
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/epilogue.hpp"
+#include "ck_tile/ops/gemm.hpp"
+
+using RowMajor = ck_tile::tensor_layout::gemm::RowMajor;
+using ColMajor = ck_tile::tensor_layout::gemm::ColumnMajor;
+
+template <typename TeType> struct TeTypeToCkType;
+template <> struct TeTypeToCkType<transformer_engine::fp16> { using type = ck_tile::half_t; };
+template <> struct TeTypeToCkType<transformer_engine::bf16> { using type = ck_tile::bfloat16_t; };
+
+// Treat TE tensors as generalized 2D matrices by flattening:
+// (D1, D2, ..., Dn) -> (D1*...*D(n-1), Dn), consistent with TE Tensor::flat_*_dim.
+static inline bool get_flat_2d_dims(const transformer_engine::Tensor& t,
+                                    int64_t& d0, int64_t& d1) {
+  // Require at least a matrix (rank >= 2). Higher ranks are flattened.
+  if (t.shape().size() < 2)
+    return false;
+  d0 = static_cast<int64_t>(t.flat_first_dim());
+  d1 = static_cast<int64_t>(t.flat_last_dim());
+  return true;
+}
+
+static inline const transformer_engine::SimpleTensor& data_view(const transformer_engine::Tensor& t) {
+  return t.data;  // rowwise data view
+}
+
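+// Tile configuration used for every group: a 256x128x64 block tile, a 2x2 warp
+// grid with 32x32x16 per-warp GEMM tiles, and padding enabled on M/N/K so
+// arbitrary (non-multiple) per-group shapes are accepted.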
+struct TileCfg_basic {
+  static constexpr ck_tile::index_t M_Tile = 256;
+  static constexpr ck_tile::index_t N_Tile = 128;
+  static constexpr ck_tile::index_t K_Tile = 64;
+
+  static constexpr ck_tile::index_t M_Warp = 2;
+  static constexpr ck_tile::index_t N_Warp = 2;
+  static constexpr ck_tile::index_t K_Warp = 1;
+
+  static constexpr ck_tile::index_t M_Warp_Tile = 32;
+  static constexpr ck_tile::index_t N_Warp_Tile = 32;
+  static constexpr ck_tile::index_t K_Warp_Tile = 16;
+
+  static constexpr bool kPadM = true;
+  static constexpr bool kPadN = true;
+  static constexpr bool kPadK = true;
+
+  static constexpr bool DoubleSmemBuffer = false;
+
+  static constexpr ck_tile::index_t TilePartitionerGroupNum = 8;
+  static constexpr ck_tile::index_t TilePartitionerM01 = 1;
+};
+
+// This class instantiates CK_Tile's grouped GEMM pipeline.
+// See e.g. https://github.com/ROCm/composable_kernel/blob/develop/example/ck_tile/03_gemm/universal_gemm_invoker.hpp for reference.
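+// The kernel is assembled from a spatially-local tile partitioner (work
+// distribution over the tiles of all groups), the AgBgCr compute-v3 pipeline,
+// and a CShuffle epilogue; MemOp selects the epilogue's memory operation,
+// i.e. whether results are written or accumulated into D.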
+template <typename AType, typename BType, typename AccType, typename CType, typename ALayout, typename BLayout, typename CLayout, ck_tile::memory_operation_enum MemOp, typename TileCfg = TileCfg_basic>
+struct Runner {
+  using GemmShape = ck_tile::TileGemmShape<
+      ck_tile::sequence<TileCfg::M_Tile, TileCfg::N_Tile, TileCfg::K_Tile>,
+      ck_tile::sequence<TileCfg::M_Warp, TileCfg::N_Warp, TileCfg::K_Warp>,
+      ck_tile::sequence<TileCfg::M_Warp_Tile, TileCfg::N_Warp_Tile, TileCfg::K_Warp_Tile>>;
+
+  using Partitioner = ck_tile::GemmSpatiallyLocalTilePartitioner<
+      GemmShape, TileCfg::TilePartitionerGroupNum, TileCfg::TilePartitionerM01>;
+
+  using UniversalTraits = ck_tile::PersistentTileGemmUniversalTraits<
+      TileCfg::kPadM, TileCfg::kPadN, TileCfg::kPadK,
+      TileCfg::DoubleSmemBuffer, ALayout, BLayout, CLayout>;
+
+  static constexpr ck_tile::GemmPipelineScheduler Scheduler =
+      ck_tile::GemmPipelineScheduler::Intrawave;
+
+  using Problem = ck_tile::UniversalGemmPipelineProblem<
+      AType, BType, AccType, GemmShape, UniversalTraits, Scheduler>;
+
+  using Pipeline = ck_tile::GemmPipelineAgBgCrCompV3<Problem>;
+
+  using Epilogue = ck_tile::CShuffleEpilogue<
+      ck_tile::CShuffleEpilogueProblem<
+          AType, BType, ck_tile::tuple<>, AccType,
+          CType, ck_tile::tuple<>, CLayout,
+          ck_tile::element_wise::PassThrough,
+          Partitioner::MPerBlock, Partitioner::NPerBlock,
+          TileCfg::M_Warp, TileCfg::N_Warp,
+          TileCfg::M_Warp_Tile, TileCfg::N_Warp_Tile, TileCfg::K_Warp_Tile,
+          Problem::TransposeC, MemOp>>;
+
+  using Kernel = ck_tile::GroupedGemmKernel<Partitioner, Pipeline, Epilogue>;
+};
+
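+// Builds one host-side descriptor per group from the flattened 2D views,
+// copies the kernel arguments into the caller-provided device workspace, and
+// issues a single grouped-kernel launch on the given stream.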
+template <typename T, typename ALayout, typename BLayout, ck_tile::memory_operation_enum MemOp>
+static bool run_grouped_impl(const transformer_engine::Tensor* const* A_use,
+                             const transformer_engine::Tensor* const* B_use,
+                             transformer_engine::Tensor* const* D,
+                             int group_num,
+                             bool transA_use,
+                             bool transB_use,
+                             void* workspace,
+                             size_t workspace_bytes,
+                             hipStream_t stream)
+{
+  using Kernel = typename Runner<T, T, float, T, ALayout, BLayout, RowMajor, MemOp>::Kernel;
+
+  const size_t needed = Kernel::GetWorkSpaceSize(group_num);
+  if (!workspace || workspace_bytes < needed) {
+    NVTE_ERROR("ck_tile_grouped_gemm: insufficient workspace. Needed bytes=", needed);
+    return false;
+  }
+
+  std::vector<ck_tile::GroupedGemmHostArgs<>> descs;
+  descs.reserve(group_num);
+
+  for (int i = 0; i < group_num; ++i) {
+    const auto& a = data_view(*A_use[i]);
+    const auto& b = data_view(*B_use[i]);
+    const auto& d = data_view(*D[i]);
+
+    int64_t Ad0 = 0, Ad1 = 0, Bd0 = 0, Bd1 = 0, Dd0 = 0, Dd1 = 0;
+    if (!get_flat_2d_dims(*A_use[i], Ad0, Ad1) ||
+        !get_flat_2d_dims(*B_use[i], Bd0, Bd1) ||
+        !get_flat_2d_dims(*D[i], Dd0, Dd1)) {
+      NVTE_ERROR("ck_tile_grouped_gemm: expected all groups to be rank>=2 (2D or higher).");
+      return false;
+    }
+
+    const int64_t M = transA_use ? Ad1 : Ad0;
+    const int64_t K = transA_use ? Ad0 : Ad1;
+    const int64_t N = transB_use ? Bd0 : Bd1;
+    const int64_t Kb = transB_use ? Bd1 : Bd0;
+
+    if (Kb != K) {
+      NVTE_ERROR("ck_tile_grouped_gemm: K mismatch between A and B in group ", i);
+      return false;
+    }
+
+    if (Dd0 != M || Dd1 != N) {
+      NVTE_ERROR("ck_tile_grouped_gemm: D shape mismatch in group ", i);
+      return false;
+    }
+
+    // Leading dimensions under the flattened-contiguous interpretation
+    const ck_tile::index_t stride_A = Ad1;
+    const ck_tile::index_t stride_B = Bd1;
+    const ck_tile::index_t stride_E = Dd1;
+
+    descs.emplace_back(
+        a.dptr,
+        b.dptr,
+        std::array<const void*, 0>{},
+        d.dptr,
+        1,
+        M,
+        N,
+        K,
+        stride_A,
+        stride_B,
+        std::array<ck_tile::index_t, 0>{},
+        stride_E);
+  }
+
+  const dim3 grids = Kernel::GridSize(descs);
+  auto kargs = Kernel::MakeKargs(descs);
+  if (!Kernel::IsSupportedArgument(kargs)) {
+    NVTE_ERROR("ck_tile_grouped_gemm: CK_Tile kernel arguments not supported for this config.");
+    return false;
+  }
+
+  HIP_CHECK_ERROR(hipMemcpyAsync(workspace,
+                                 kargs.data(),
+                                 kargs.size() * sizeof(typename decltype(kargs)::value_type),
+                                 hipMemcpyHostToDevice,
+                                 stream));
+
+  const ck_tile::stream_config s{stream};
+  const dim3 blocks = Kernel::BlockSize();
+
+  ck_tile::launch_kernel(
+      s,
+      ck_tile::make_kernel<1>(
+          Kernel{}, grids, blocks, 0,
+          ck_tile::cast_pointer_to_constant_address_space(workspace),
+          group_num));
+  return true;
+}
+
+template <typename T, ck_tile::memory_operation_enum MemOp>
+static inline bool dispatch_grouped(bool transA_use,
+                                    bool transB_use,
+                                    const transformer_engine::Tensor* const* A_use,
+                                    const transformer_engine::Tensor* const* B_use,
+                                    transformer_engine::Tensor* const* D,
+                                    int group_num,
+                                    void* workspace,
+                                    size_t workspace_bytes,
+                                    hipStream_t stream) {
+
+// FIXME: This could be a templated lambda function in C++20.
+#define CALL(ALayout_, BLayout_, ta_, tb_)                                          \
+  return run_grouped_impl<T, ALayout_, BLayout_, MemOp>(                            \
+      A_use, B_use, D, group_num, (ta_), (tb_), workspace, workspace_bytes, stream)
+
+  if (!transA_use && !transB_use) { CALL(RowMajor, RowMajor, false, false); }
+  if (!transA_use &&  transB_use) { CALL(RowMajor, ColMajor, false, true ); }
+  if ( transA_use && !transB_use) { CALL(ColMajor, RowMajor, true,  false); }
+  /* transA_use && transB_use */  { CALL(ColMajor, ColMajor, true,  true ); }
+
+#undef CALL
+}
+
+bool ck_tile_grouped_gemm(const NVTETensor* A,
+                          const NVTETensor* B,
+                          NVTETensor* D,
+                          int group_num,
+                          bool transA,
+                          bool transB,
+                          NVTETensor* workspace,
+                          bool accumulate,
+                          hipStream_t stream)
+{
+  if (group_num <= 0)
+    return true;
+
+  // Convert A/B/D arrays into TE Tensor arrays
+  std::vector<transformer_engine::Tensor*> A_te(group_num);
+  std::vector<transformer_engine::Tensor*> B_te(group_num);
+  std::vector<transformer_engine::Tensor*> D_te(group_num);
+
+  for (int i = 0; i < group_num; ++i) {
+    A_te[i] = transformer_engine::convertNVTETensorCheck(A[i]);
+    B_te[i] = transformer_engine::convertNVTETensorCheck(B[i]);
+    D_te[i] = transformer_engine::convertNVTETensorCheck(D[i]);
+  }
+
+  // Workspace pointer + bytes
+  void* ws_ptr = nullptr;
+  size_t ws_bytes = 0;
+  if (workspace) {
+    auto* ws_te = transformer_engine::convertNVTETensorCheck(*workspace);
+    ws_ptr = ws_te->data.dptr;
+    ws_bytes = ws_te->data.numel() *
+               transformer_engine::typeToSize(ws_te->data.dtype);
+  }
+
+  // Normalize similar to upstream
+  // See https://github.com/NVIDIA/TransformerEngine/blob/59f6f3876767d07045152bfae07b5dd4c54e1725/transformer_engine/common/gemm/cutlass_grouped_gemm.cu#L54-L68
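+  // The swap relies on (op(A) * op(B))^T = op(B)^T * op(A)^T: TE/cuBLAS describe
+  // the product in column-major terms, while the CK_Tile kernel works on
+  // row-major views of the same buffers, i.e. on the transposed matrices.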
+  const transformer_engine::Tensor* const* A_use = B_te.data();
+  const transformer_engine::Tensor* const* B_use = A_te.data();
+  const bool transA_use = transB;
+  const bool transB_use = transA;
+
+  const auto a_dtype = A_use[0]->dtype();
+
+  TRANSFORMER_ENGINE_TYPE_SWITCH_16BIT(a_dtype, te_type, {
+    using T = typename TeTypeToCkType<te_type>::type;
+
+    if (accumulate)
+      return dispatch_grouped<T, ck_tile::memory_operation_enum::atomic_add>(
+          transA_use, transB_use, A_use, B_use, D_te.data(), group_num,
+          ws_ptr, ws_bytes, stream);
+    else
+      return dispatch_grouped<T, ck_tile::memory_operation_enum::set>(
+          transA_use, transB_use, A_use, B_use, D_te.data(), group_num,
+          ws_ptr, ws_bytes, stream);
+  });
+}
diff --git a/transformer_engine/common/gemm/ck_grouped_gemm.h b/transformer_engine/common/gemm/ck_grouped_gemm.h
new file mode 100644
index 000000000..d539b47f7
--- /dev/null
+++ b/transformer_engine/common/gemm/ck_grouped_gemm.h
@@ -0,0 +1,11 @@
+/* Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved. */
+
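+// Grouped GEMM over group_num pairs (A[i], B[i]) -> D[i] using CK_Tile.
+// workspace must provide device memory for the kernel-argument block; only
+// fp16/bf16 operands are supported, and accumulate=true adds the result into D
+// instead of overwriting it.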
+bool ck_tile_grouped_gemm(const NVTETensor* A,
+                          const NVTETensor* B,
+                          NVTETensor* D,
+                          int group_num,
+                          bool transA,
+                          bool transB,
+                          NVTETensor* workspace,
+                          bool accumulate,
+                          hipStream_t stream);
diff --git a/transformer_engine/common/gemm/cublaslt_gemm.cu b/transformer_engine/common/gemm/cublaslt_gemm.cu
index 9c2ca9b4c..bb322233b 100644
--- a/transformer_engine/common/gemm/cublaslt_gemm.cu
+++ b/transformer_engine/common/gemm/cublaslt_gemm.cu
@@ -1,6 +1,6 @@
 /*************************************************************************
  * This file was modified for portability to AMDGPU
- * Copyright (c) 2022-2025, Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2022-2026, Advanced Micro Devices, Inc. All rights reserved.
  * Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  *
  * See LICENSE for license information.
@@ -24,8 +24,11 @@
 #include "../util/logging.h"
 #include "../util/multi_stream.h"
 #include "common/util/cuda_runtime.h"
+#include "common/util/system.h"
 #ifndef __HIP_PLATFORM_AMD__
 #include "cutlass_grouped_gemm.cuh"
+#else
+#include "ck_grouped_gemm.h"
 #endif
 
 #ifndef __HIP_PLATFORM_AMD__
@@ -788,9 +791,10 @@ void nvte_multi_tensor_gemm(const NVTETensor *A, const NVTETensor *B, NVTETensor
   NVTE_API_CALL(nvte_multi_tensor_gemm);
 
 #ifdef __HIP_PLATFORM_AMD__
-  multi_stream_cublas_gemm(A, B, D, bias, pre_gelu_out, num_gemms, transa, transb, grad,
-                           workspace, accumulate, use_split_accumulator, math_sm_count, stream);
-#else
+  if (num_gemms <= 0)
+    return;
+#endif
+
   const int current_device = transformer_engine::cuda::current_device();
   const bool is_hopper = (transformer_engine::cuda::sm_arch(current_device) == 90);
   const bool use_cutlass = transformer_engine::getenv<bool>("NVTE_USE_CUTLASS_GROUPED_GEMM", false);
@@ -803,7 +807,11 @@ void nvte_multi_tensor_gemm(const NVTETensor *A, const NVTETensor *B, NVTETensor
   };
 
   // Currently only support cutlass group gemm on Hopper Arch
+#ifdef __HIP_PLATFORM_AMD__
+  if (!use_cutlass || num_gemms == 1) {
+#else
   if (!(is_hopper && use_cutlass)) {
+#endif
     cublas_path();
     return;
   }
@@ -837,12 +845,21 @@ void nvte_multi_tensor_gemm(const NVTETensor *A, const NVTETensor *B, NVTETensor
     auto *inputA = transformer_engine::convertNVTETensorCheck(A[0]);
     auto *inputB = transformer_engine::convertNVTETensorCheck(B[0]);
     auto *OutputD = transformer_engine::convertNVTETensorCheck(D[0]);
+#ifdef __HIP_PLATFORM_AMD__
+    auto A_dt = inputA->data.dtype;
+    auto B_dt = inputB->data.dtype;
+    auto D_dt = OutputD->data.dtype;
+    return (A_dt == B_dt) && (A_dt == D_dt) &&
+           (A_dt == transformer_engine::DType::kFloat16 ||
+            A_dt == transformer_engine::DType::kBFloat16);
+#else
     auto A_type = get_cuda_dtype(inputA->data.dtype);
    auto B_type = get_cuda_dtype(inputB->data.dtype);
    auto D_type = get_cuda_dtype(OutputD->data.dtype);

    return (A_type == B_type) && (A_type == D_type) &&
           ((A_type == CUDA_R_16BF) || (A_type == CUDA_R_16F));
+#endif
   };
 
   // CUTLASS Grouped GEMM fast path (SM90/TMA)
@@ -855,14 +872,18 @@ void nvte_multi_tensor_gemm(const NVTETensor *A, const NVTETensor *B, NVTETensor
   //
   // Otherwise, fall back to cuBLAS.
   if (is_empty_arr(bias) && is_empty_arr(pre_gelu_out) && is_supported_dtype() &&
+#ifdef __HIP_PLATFORM_AMD__
+      true) {
+    ck_tile_grouped_gemm(A, B, D, num_gemms, transa, transb, workspace, accumulate, stream);
+#else
       all_groups_uniform_k128(B, transb)) {
     cutlass_grouped_gemm(A, B, D, num_gemms, transa, transb, grad, workspace, accumulate,
                          current_device, math_sm_count, stream);
+#endif
   } else {
     if (warn_fallback) {
       NVTE_WARN("Fallback to cuBLAS grouped GEMM.");
     }
     cublas_path();
   }
-#endif  // __HIP_PLATFORM_AMD__
 }
diff --git a/transformer_engine/pytorch/utils.py b/transformer_engine/pytorch/utils.py
index d124fbeaf..92253ad9c 100644
--- a/transformer_engine/pytorch/utils.py
+++ b/transformer_engine/pytorch/utils.py
@@ -1,5 +1,5 @@
 # This file was modified for portability to AMDGPU
-# Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (c) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 # Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # See LICENSE for license information.
@@ -456,6 +456,14 @@ def is_mi308():
         import re
         return (re.search('AMD Instinct MI308', torch.cuda.get_device_name(torch.cuda.current_device())) is not None)
 
+    def is_mi300_class():
+        """Check whether the current device is of the gfx942 class."""
+        return get_device_compute_capability() == (9, 4)
+
+    def is_mi350_class():
+        """Check whether the current device is of the gfx950 class."""
+        return get_device_compute_capability() == (9, 5)
+
     @functools.lru_cache(maxsize=None)
     def is_fp8_fnuz():
         return IS_HIP_EXTENSION and get_device_compute_capability() == (9, 4)