fanshiqing · fanshiqing · Jun 9, 2025 · Jun 9, 2025 · Jun 10, 2025 · Jun 10, 2025
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -18,8 +18,8 @@ jobs:
       - name: 'Dependencies'
         run: |
           apt-get update
-          apt-get install -y git python3.9 pip ninja-build cudnn9-cuda-12
-          pip install cmake==3.21.0
+          apt-get install -y git python3.9 pip cudnn9-cuda-12
+          pip install cmake==3.21.0 pybind11[global] ninja
       - name: 'Checkout'
         uses: actions/checkout@v3
         with:
@@ -42,8 +42,8 @@ jobs:
       - name: 'Dependencies'
         run: |
           apt-get update
-          apt-get install -y git python3.9 pip ninja-build cudnn9-cuda-12
-          pip install cmake torch pydantic importlib-metadata>=1.0 packaging pybind11 numpy einops
+          apt-get install -y git python3.9 pip cudnn9-cuda-12
+          pip install cmake torch ninja pydantic importlib-metadata>=1.0 packaging pybind11 numpy einops
       - name: 'Checkout'
         uses: actions/checkout@v3
         with:
@@ -62,6 +62,8 @@ jobs:
       image: ghcr.io/nvidia/jax:jax
       options: --user root
     steps:
+      - name: 'Dependencies'
+        run: pip install pybind11[global]
       - name: 'Checkout'
         uses: actions/checkout@v3
         with:
@@ -87,7 +89,7 @@ jobs:
         with:
           submodules: recursive
       - name: 'Build'
-        run: pip install --no-build-isolation . -v
+        run: pip install --no-build-isolation . -v --no-deps
         env:
           NVTE_FRAMEWORK: all
           MAX_JOBS: 1

diff --git a/3rdparty/cudnn-frontend b/3rdparty/cudnn-frontend
diff --git a/README.rst b/README.rst
@@ -216,13 +216,13 @@ Alternatively, install directly from the GitHub repository:
 
 .. code-block:: bash
 
-    pip install git+https://github.com/NVIDIA/TransformerEngine.git@stable
+    pip install --no-build-isolation git+https://github.com/NVIDIA/TransformerEngine.git@stable
 
 When installing from GitHub, you can explicitly specify frameworks using the environment variable:
 
 .. code-block:: bash
 
-    NVTE_FRAMEWORK=pytorch,jax pip install git+https://github.com/NVIDIA/TransformerEngine.git@stable
+    NVTE_FRAMEWORK=pytorch,jax pip install --no-build-isolation git+https://github.com/NVIDIA/TransformerEngine.git@stable
 
 conda Installation
 ^^^^^^^^^^^^^^^^^^

diff --git a/benchmarks/linear/benchmark_grouped_linear.py b/benchmarks/linear/benchmark_grouped_linear.py
@@ -0,0 +1,241 @@
+# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# See LICENSE for license information.
+
+import argparse
+import torch
+import torch.utils.benchmark as benchmark
+import pandas as pd
+import pathlib
+
+from transformer_engine.pytorch.module import GroupedLinear
+from transformer_engine.common.recipe import Float8BlockScaling
+from transformer_engine.pytorch.fp8 import fp8_autocast
+from contextlib import nullcontext
+
+RECIPES = {
+    "bf16": None,
+    "fp8_sub_channel": Float8BlockScaling(),
+}
+
+
+def run_linear_multiple_steps(layer, x, m_splits, mode, gradient, run_num_steps=1, recipe=None):
+    assert mode in ["fwd_only", "fwd_bwd"]
+    fp8_context = (
+        fp8_autocast(enabled=True, fp8_recipe=recipe) if recipe is not None else nullcontext()
+    )
+    # print(f"fp8_context: {fp8_context} and is it nullcontext? {isinstance(fp8_context, nullcontext)}")
+
+    if mode == "fwd_only":
+        with torch.no_grad(), fp8_context:
+            for i in range(run_num_steps):
+                y_q = layer.forward(
+                    x,
+                    m_splits,
+                    is_first_microbatch=(i == 0),
+                )
+        return y_q
+    else:
+        # reset gradients
+        layer.zero_grad()
+        x.grad = None
+
+        with fp8_context:
+            for i in range(run_num_steps):
+                label = f"step_{i}"
+                torch.cuda.nvtx.range_push(label)
+                y_q = layer.forward(
+                    x,
+                    m_splits,
+                    is_first_microbatch=(i == 0),
+                )
+                y_q.backward(gradient)
+                torch.cuda.nvtx.range_pop()
+
+        grads_q = []
+        grads_q.append(x.grad)
+        # remaining derivatives are in respect to model parameters
+        for p in layer.parameters():
+            if p.requires_grad:
+                grads_q.append(p.grad)
+
+        return y_q, grads_q
+
+
+def benchmark_linear(
+    x,
+    ws,
+    m_splits,
+    bias,
+    recipe_name,
+    mode,
+    num_gemms=4,
+):
+    params_dtype = torch.bfloat16
+    recipe = RECIPES[recipe_name]
+
+    in_features = x.shape[1]
+    out_features = ws[0].shape[0]
+    gradient = torch.ones((x.shape[0], out_features), dtype=torch.bfloat16, device=x.device)
+
+    layer = GroupedLinear(
+        num_gemms,
+        in_features,
+        out_features,
+        bias=bias is not None,
+        params_dtype=params_dtype,
+    )
+
+    layer = layer.to("cuda")
+    with torch.no_grad():
+        for i in range(num_gemms):
+            weight_i = getattr(layer, f"weight{i}")
+            weight_i.copy_(ws[i])
+            if bias is not None:
+                bias_i = getattr(layer, f"bias{i}")
+                bias_i.copy_(bias)
+
+    num_microbatches = 32
+
+    label = f"{recipe_name}_{'grouped'}"
+    torch.cuda.nvtx.range_push(label)
+    timing = benchmark.Timer(
+        stmt=(
+            "run_linear_multiple_steps(layer, x, m_splits, mode, gradient, num_microbatches,"
+            " recipe)"
+        ),
+        globals={
+            "run_linear_multiple_steps": run_linear_multiple_steps,
+            "layer": layer,
+            "x": x,
+            "m_splits": m_splits,
+            "mode": mode,
+            "gradient": gradient,
+            "num_microbatches": num_microbatches,
+            "recipe": recipe,
+        },
+        num_threads=1,
+    ).blocked_autorange(min_run_time=5)
+    print(f"{recipe_name}: {timing} \n")
+    timing_ms = timing.median * 1000 / num_microbatches
+
+    return timing_ms
+
+
+def run_benchmark_linear(mkns, recipe_name, use_bias, num_gemms=4):
+    data = []
+    assert not use_bias, "Bias is not supported for GroupedLinear benchmark"
+
+    print(f"========== Benchmarking {recipe_name} ==========")
+    for m, k, n in mkns:
+        device = "cuda"
+        x = torch.randn((m, k), dtype=torch.bfloat16, device=device, requires_grad=True)
+        ws = [torch.randn((n, k), dtype=torch.bfloat16, device=device) for _ in range(num_gemms)]
+        assert m % num_gemms == 0
+        m_splits = [m // num_gemms] * num_gemms
+        # Bias is not supported for GroupedLinear benchmark
+        bias = None
+
+        # Run the benchmark
+        print(f"fwd_m={m}, fwd_k={k}, fwd_n={n}")
+
+        grouped_fwd_bwd_timing_ms = benchmark_linear(
+            x,
+            ws,
+            m_splits,
+            bias,
+            recipe_name,
+            mode="fwd_bwd",
+            num_gemms=num_gemms,
+        )
+
+        # Append the results
+        data.append(
+            [
+                m,
+                k,
+                n,
+                recipe_name,
+                num_gemms,
+                grouped_fwd_bwd_timing_ms,
+            ]
+        )
+
+    df = pd.DataFrame(
+        data=data,
+        columns=[
+            "m",
+            "k",
+            "n",
+            "recipe",
+            "num_gemms",
+            "grouped_fwd_bwd_time_ms",
+        ],
+    )
+
+    print(df, "\n")
+    return df
+
+
+if __name__ == "__main__":
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--profile", action="store_true", help="Enable profiling mode")
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="benchmark_output/",
+        help="output path for report",
+    )
+    args = parser.parse_args()
+
+    use_bias = False
+    # Set the MKN values to benchmark
+    mkns = []
+    for m in [1024]:
+        # for m in [4096, 8192, 16384]:
+        # for n in [1024, 2048, 4096, 8192, 16384]:
+        for n in [3072]:
+            for k in [4096]:
+                mkns.append((m, k, n))
+
+    # recipe_list = [
+    #     "bf16", "fp8_sub_channel",
+    # ]
+    recipe_list = [
+        "fp8_sub_channel",
+    ]
+
+    # num_gemms_list = [16, 32]
+    num_gemms_list = [4]
+
+    if args.profile:
+        # nsys profile --output=./benchmarks/linear/mkn_4096_4096_4096_numgemm_1_bf16 --trace=cuda,nvtx,cudnn,cublas python benchmarks/linear/benchmark_grouped_linear.py --profile
+        # nsys profile --output=./benchmarks/linear/mkn_8192_8192_8192_numgemm_32_bf16 --trace=cuda,nvtx,cudnn,cublas python benchmarks/linear/benchmark_grouped_linear.py --profile
+        # nsys profile --output=./benchmarks/linear/mkn_4096_4096_4096_numgemm_8_fp8_sub_channel --trace=cuda,nvtx,cudnn,cublas python benchmarks/linear/benchmark_grouped_linear.py --profile
+        # nsys profile --output=./benchmarks/linear/mkn_8192_8192_8192_numgemm_2_fp8_sub_channel --trace=cuda,nvtx,cudnn,cublas python benchmarks/linear/benchmark_grouped_linear.py --profile
+        mkns = [(4096, 4096, 4096)]
+        recipe_list = ["fp8_sub_channel"]
+        # recipe_list = ["bf16"]
+        num_gemms_list = [8]
+        torch.autograd.profiler.emit_nvtx(record_shapes=True).__enter__()
+
+    # Initialize a dataframe to store the results
+    df_linears = pd.DataFrame()
+
+    # Run the fp8 benchmarks
+    for num_gemms in num_gemms_list:
+        print(f"========== Benchmarking with num_gemms={num_gemms} ==========")
+        for recipe_name in recipe_list:
+            df = run_benchmark_linear(
+                mkns,
+                recipe_name,
+                use_bias,
+                num_gemms=num_gemms,
+            )
+            df_linears = pd.concat([df_linears, df])
+
+    print(df_linears)
+
+    if args.profile:
+        torch.autograd.profiler.emit_nvtx().__exit__(None, None, None)
diff --git a/build_tools/VERSION.txt b/build_tools/VERSION.txt
@@ -1 +1 @@
-2.5.0.dev0
+2.5.0
diff --git a/build_tools/jax.py b/build_tools/jax.py
@@ -4,7 +4,6 @@
 
 """JAX related extensions."""
 import os
-import shutil
 from pathlib import Path
 
 import setuptools
@@ -13,6 +12,16 @@
 from typing import List
 
 
+def install_requirements() -> List[str]:
+    """Install dependencies for TE/JAX extensions."""
+    return ["jax", "flax>=0.7.1"]
+
+
+def test_requirements() -> List[str]:
+    """Test dependencies for TE/JAX extensions."""
+    return ["numpy"]
+
+
 def xla_path() -> str:
     """XLA root path lookup.
     Throws FileNotFoundError if XLA source is not found."""
@@ -66,20 +75,9 @@ def setup_jax_extension(
     # Define TE/JAX as a Pybind11Extension
     from pybind11.setup_helpers import Pybind11Extension
 
-    class Pybind11CPPExtension(Pybind11Extension):
-        """Modified Pybind11Extension to allow custom CXX flags."""
-
-        def _add_cflags(self, flags: List[str]) -> None:
-            if isinstance(self.extra_compile_args, dict):
-                cxx_flags = self.extra_compile_args.pop("cxx", [])
-                cxx_flags += flags
-                self.extra_compile_args["cxx"] = cxx_flags
-            else:
-                self.extra_compile_args[:0] = flags
-
-    return Pybind11CPPExtension(
+    return Pybind11Extension(
         "transformer_engine_jax",
         sources=[str(path) for path in sources],
         include_dirs=[str(path) for path in include_dirs],
-        extra_compile_args={"cxx": cxx_flags},
+        extra_compile_args=cxx_flags,
     )
diff --git a/build_tools/pytorch.py b/build_tools/pytorch.py
@@ -9,6 +9,22 @@
 import setuptools
 
 from .utils import all_files_in_dir, cuda_version, get_cuda_include_dirs, debug_build_enabled
+from typing import List
+
+
+def install_requirements() -> List[str]:
+    """Install dependencies for TE/JAX extensions."""
+    reqs = ["torch>=2.1", "einops"]
+    reqs.append(
+        "nvdlfw-inspect @"
+        " git+https://github.com/NVIDIA/nvidia-dlfw-inspect.git@v0.1#egg=nvdlfw-inspect"
+    )
+    return reqs
+
+
+def test_requirements() -> List[str]:
+    """Test dependencies for TE/JAX extensions."""
+    return ["numpy", "torchvision", "transformers"]
 
 
 def setup_pytorch_extension(

diff --git a/build_tools/utils.py b/build_tools/utils.py
@@ -354,10 +354,3 @@ def copy_common_headers(
         new_path = dst_dir / path.relative_to(src_dir)
         new_path.parent.mkdir(exist_ok=True, parents=True)
         shutil.copy(path, new_path)
-
-
-def install_and_import(package):
-    """Install a package via pip (if not already installed) and import into globals."""
-    main_package = package.split("[")[0]
-    subprocess.check_call([sys.executable, "-m", "pip", "install", package])
-    globals()[main_package] = importlib.import_module(main_package)
+20 −0		benchmark/sdpa_benchmark_bf16_training/Dockerfile
+127 −0		benchmark/sdpa_benchmark_bf16_training/README.md
+63 −0		benchmark/sdpa_benchmark_bf16_training/artifacts/sample_b200_run.txt
+41 −0		benchmark/sdpa_benchmark_bf16_training/artifacts/sdpa_benchmark_results_NVIDIA_B200.csv
+ −		benchmark/sdpa_benchmark_bf16_training/artifacts/sdpa_benchmark_results_NVIDIA_B200.png
+357 −0		benchmark/sdpa_benchmark_bf16_training/benchmark_sdpa.py
+535 −0		benchmark/sdpa_benchmark_bf16_training/benchmark_single_sdpa.py
+8 −3		include/cudnn_frontend/node/scaled_dot_product_flash_attention.h