From 057b34acf48faf3b202a13fc73430c5cba652b7a Mon Sep 17 00:00:00 2001
From: tongke <124763920+tongke6@users.noreply.github.com>
Date: Fri, 12 Jun 2026 11:27:33 +0800
Subject: [PATCH 01/14] Split CUDA extensions by SM architecture for fat-binary
 wheel builds (#83)

Replace the monolithic `cula.cudac` extension with per-arch extensions
(`cula._cudac_sm90`, `cula._cudac_sm100`) so that SM90 and SM100/SM103
kernels are compiled independently with their own `-gencode` flags. This
enables building fat-binary wheels containing all architectures without
needing the target GPU present at build time.

Key changes:
- Split pybind.cu into per-file PYBIND11_MODULE definitions
- Add `cula/cudac.py` proxy module for backwards-compatible imports
- Add `CULA_BUILD_ALL_ARCHS=1` env var to enable all SM targets
- Add `--fat` flag to build_wheel.sh for CI fat-binary builds
- Pin dependency versions and use `no-local-version` scheme for
  reproducible wheel filenames
- Use setuptools_scm for dynamic `__version__`
- Document pre-built wheel installation in README
---
 .github/workflows/build-release.yml | 125 +++++++++++++++++++++++
 README.md                           |  12 +++
 csrc/api/kda_sm100.cu               |   6 ++
 csrc/api/kda_sm90.cu                |   5 +
 csrc/api/pybind.cu                  |  80 ---------------
 cula/__init__.py                    |   5 +-
 cula/cudac.py                       |  65 ++++++++++++
 pyproject.toml                      |   9 +-
 scripts/build_wheel.sh              |  18 +++-
 setup.py                            | 147 ++++++++++++++--------------
 10 files changed, 309 insertions(+), 163 deletions(-)
 create mode 100644 .github/workflows/build-release.yml
 delete mode 100644 csrc/api/pybind.cu
 create mode 100644 cula/cudac.py

diff --git a/.github/workflows/build-release.yml b/.github/workflows/build-release.yml
new file mode 100644
index 00000000..cd5b54d0
--- /dev/null
+++ b/.github/workflows/build-release.yml
@@ -0,0 +1,125 @@
+name: Build & Release Wheels
+
+on:
+  push:
+    tags:
+      - "v*"
+  workflow_dispatch:
+
+concurrency:
+  group: build-release-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  build-wheel:
+    name: "wheel / ${{ matrix.cuda }} / cp312 / ${{ matrix.arch }}"
+    runs-on: ${{ matrix.runner }}
+    strategy:
+      fail-fast: false
+      matrix:
+        cuda:
+          - cu129
+          - cu130
+        arch:
+          - x86_64
+          - aarch64
+        include:
+          - cuda: cu129
+            cuda_version: "12.9.0"
+            torch_index: "https://download.pytorch.org/whl/cu129"
+          - cuda: cu130
+            cuda_version: "13.0.0"
+            torch_index: "https://download.pytorch.org/whl/cu130"
+          - arch: x86_64
+            runner: ubuntu-latest
+          - arch: aarch64
+            runner: ubuntu-24.04-arm
+    container:
+      image: "nvidia/cuda:${{ matrix.cuda_version }}-devel-ubuntu24.04"
+
+    steps:
+      - name: Free disk space
+        run: |
+          rm -rf /opt/hostedtoolcache /usr/local/lib/android /usr/share/dotnet \
+                 /usr/local/share/boost /opt/ghc 2>/dev/null || true
+          apt-get clean 2>/dev/null || true
+          df -h / || true
+
+      - name: Install git
+        run: |
+          apt-get update && apt-get install -y --no-install-recommends git \
+            && rm -rf /var/lib/apt/lists/*
+
+      - name: Checkout
+        uses: actions/checkout@v5
+        with:
+          fetch-depth: 0
+          submodules: recursive
+
+      - name: Configure git safe directory
+        run: git config --global --add safe.directory "$GITHUB_WORKSPACE"
+
+      - name: Set up Python
+        uses: actions/setup-python@v6
+        with:
+          python-version: "3.12"
+
+      - name: Install Python dependencies
+        run: |
+          python -m pip install --no-cache-dir --upgrade pip
+          python -m pip install --no-cache-dir torch --index-url ${{ matrix.torch_index }}
+          python -m pip install --no-cache-dir setuptools wheel "setuptools_scm>=6.0" build ninja
+
+      - name: Compute version
+        id: version
+        run: |
+          # POSIX `case`, not bash `[[ ]]`: container jobs run steps with `sh` by default.
+          case "$GITHUB_REF" in
+            refs/tags/v*) BASE="${GITHUB_REF#refs/tags/v}" ;;
+            # Not a tag: ask setuptools_scm and strip any local segment (+gXXX).
+            *) BASE=$(python -c "from setuptools_scm import get_version; print(get_version().split('+')[0])") ;;
+          esac
+          echo "version=${BASE}+${{ matrix.cuda }}" >> "$GITHUB_OUTPUT"
+
+      - name: Build fat-binary wheel
+        env:
+          CULA_BUILD_ALL_ARCHS: "1"
+          SETUPTOOLS_SCM_PRETEND_VERSION: "${{ steps.version.outputs.version }}"
+          NVCC_THREADS: "4"
+          MAX_JOBS: "4"
+        run: python -m build --wheel --no-isolation
+
+      - name: Verify wheel
+        run: |
+          echo "Built wheel:"
+          ls -lh dist/*.whl
+          ls dist/*.whl | grep -q "+${{ matrix.cuda }}" \
+            || { echo "ERROR: wheel name missing +${{ matrix.cuda }} suffix"; exit 1; }
+
+      - name: Upload wheel artifact
+        uses: actions/upload-artifact@v6
+        with:
+          name: wheel-${{ matrix.cuda }}-${{ matrix.arch }}
+          path: dist/*.whl
+
+  release:
+    name: Create GitHub Release
+    needs: [build-wheel]
+    runs-on: ubuntu-latest
+    if: startsWith(github.ref, 'refs/tags/v')
+    permissions:
+      contents: write
+    steps:
+      - name: Download all artifacts
+        uses: actions/download-artifact@v6
+        with:
+          path: artifacts/
+
+      - name: Create release
+        uses: softprops/action-gh-release@v3
+        with:
+          files: |
+            artifacts/wheel-*/*.whl
+          generate_release_notes: true
+          draft: true
+          prerelease: ${{ contains(github.ref, 'rc') || contains(github.ref, 'beta') || contains(github.ref, 'alpha') }}
diff --git a/README.md b/README.md
index 7bed61e2..16e8e7b6 100644
--- a/README.md
+++ b/README.md
@@ -24,6 +24,18 @@ cuLA supports both **Hopper (SM90)** and **Blackwell (SM10X)** GPUs.
 
 > **Note:** The PyTorch CUDA version must match your system CUDA Toolkit version. Check with `nvcc --version` and `python -c "import torch; print(torch.version.cuda)"`.
 
+### Pre-built Wheels
+
+Pre-built fat-binary wheels (SM90 + SM100 + SM103) are available on [GitHub Releases](https://github.com/inclusionAI/cuLA/releases):
+
+```bash
+pip install cuda-linear-attention -f https://github.com/inclusionAI/cuLA/releases/expanded_assets/<TAG>
+```
+
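+Replace `<TAG>` with the release version (e.g., `v0.2.0`). Each release ships one wheel per CUDA build, distinguished by a local version suffix (e.g., `+cu129`, `+cu130`); `pip` does not detect your CUDA version, so pin the build that matches your CUDA Toolkit (e.g., `cuda-linear-attention==0.2.0+cu129`). Alternatively, download the `.whl` file directly from the [Releases page](https://github.com/inclusionAI/cuLA/releases) and install it with `pip install <filename>.whl`.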
+
+### Build from Source
+
 **Clone cuLA & dependencies:**
 
 ```bash
diff --git a/csrc/api/kda_sm100.cu b/csrc/api/kda_sm100.cu
index 7edca370..020d90ca 100644
--- a/csrc/api/kda_sm100.cu
+++ b/csrc/api/kda_sm100.cu
@@ -188,4 +188,10 @@ ChunkKDAFwdRecompWU(
         StaticPersistentTileScheduler::Params{tile_num, params.h_v, params.heads_per_group, params.num_sm, nullptr};
 
     kda::sm100::run_kda_fwd_recomp_w_u_sm100(params, at::cuda::getCurrentCUDAStream());
+}
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+    m.doc() = "cuLA SM100/SM103 kernels";
+    m.def("chunk_kda_fwd_intra_cuda", &ChunkKDAFwdIntra);
+    m.def("recompute_w_u_cuda", &ChunkKDAFwdRecompWU);
 }
\ No newline at end of file
diff --git a/csrc/api/kda_sm90.cu b/csrc/api/kda_sm90.cu
index 9e016eb1..d80df7cc 100644
--- a/csrc/api/kda_sm90.cu
+++ b/csrc/api/kda_sm90.cu
@@ -191,3 +191,8 @@ kda_fwd_prefill(
 
     return {output, output_state};
 }
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+    m.doc() = "cuLA SM90 kernels";
+    m.def("kda_fwd_prefill", &kda_fwd_prefill);
+}
diff --git a/csrc/api/pybind.cu b/csrc/api/pybind.cu
deleted file mode 100644
index d14a41c5..00000000
--- a/csrc/api/pybind.cu
+++ /dev/null
@@ -1,80 +0,0 @@
-// Copyright 2025-2026 Ant Group Co., Ltd.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <ATen/cuda/CUDAContext.h>
-#include <c10/cuda/CUDAGuard.h>
-#include <torch/nn/functional.h>
-#include <torch/python.h>
-
-#if defined(CULA_SM100_ENABLED) || defined(CULA_SM103_ENABLED)
-void
-ChunkKDAFwdIntra(
-    at::Tensor q,
-    at::Tensor k,
-    at::Tensor g,
-    at::Tensor beta,
-    at::Tensor cu_seqlens,
-    at::Tensor chunk_indices,
-    at::Tensor Aqk_out,
-    at::Tensor Akk_out,
-    at::Tensor tile_counter,
-    float scale,
-    int chunk_size,
-    bool use_tf32_inverse,
-    bool unified_gref);
-void
-ChunkKDAFwdRecompWU(
-    at::Tensor k,
-    at::Tensor v,
-    at::Tensor beta,
-    at::Tensor A,
-    at::Tensor g,
-    at::Tensor cu_seqlens,
-    at::Tensor chunk_indices,
-    at::Tensor w_out,
-    at::Tensor u_out,
-    at::Tensor kg_out,
-    int chunk_size,
-    std::optional<at::Tensor> q,
-    std::optional<at::Tensor> qg_out);
-#endif
-
-#if defined(CULA_SM90A_ENABLED)
-std::tuple<torch::Tensor, std::optional<torch::Tensor>>
-kda_fwd_prefill(
-    std::optional<torch::Tensor> output_,
-    std::optional<torch::Tensor> output_state_,
-    torch::Tensor const& q,
-    torch::Tensor const& k,
-    torch::Tensor const& v,
-    std::optional<torch::Tensor> input_state_,
-    std::optional<torch::Tensor> alpha_,
-    std::optional<torch::Tensor> beta_,
-    torch::Tensor const& cu_seqlens,
-    torch::Tensor workspace_buffer,
-    float scale,
-    bool output_final_state,
-    bool safe_gate);
-#endif
-
-PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
-    m.doc() = "cuLA";
-#if defined(CULA_SM100_ENABLED) || defined(CULA_SM103_ENABLED)
-    m.def("chunk_kda_fwd_intra_cuda", &ChunkKDAFwdIntra);
-    m.def("recompute_w_u_cuda", &ChunkKDAFwdRecompWU);
-#endif
-#if defined(CULA_SM90A_ENABLED)
-    m.def("kda_fwd_prefill", &kda_fwd_prefill);
-#endif
-}
diff --git a/cula/__init__.py b/cula/__init__.py
index 7272e289..6e13aa13 100644
--- a/cula/__init__.py
+++ b/cula/__init__.py
@@ -12,7 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-__version__ = "0.1.0"
+try:
+    from cula._version import version as __version__
+except ImportError:
+    __version__ = "0.1.0"
 
 from cula.ops.lightning_attn_sm100 import LinearAttentionChunkwiseDecay
 
diff --git a/cula/cudac.py b/cula/cudac.py
new file mode 100644
index 00000000..287d9747
--- /dev/null
+++ b/cula/cudac.py
@@ -0,0 +1,65 @@
+# Copyright 2025-2026 Ant Group Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Unified interface to per-architecture CUDA extensions.
+
+Downstream code can continue to use ``import cula.cudac as cula_cuda``
+and call ``cula_cuda.kda_fwd_prefill(...)`` or
+``cula_cuda.chunk_kda_fwd_intra_cuda(...)`` without knowing which
+extension provides the function.
+"""
+
+import importlib
+import sys
+from types import ModuleType
+
+
+class _CudacProxy(ModuleType):
+    """Lazy proxy that exposes functions from all built arch extensions."""
+
+    def __init__(self):
+        super().__init__(__name__)
+        self.__path__ = []
+        self._modules_loaded = False
+        self._funcs: dict[str, object] = {}
+
+    def _load(self):
+        if self._modules_loaded:
+            return
+        self._modules_loaded = True
+        for ext_name in ("cula._cudac_sm100", "cula._cudac_sm90"):
+            try:
+                mod = importlib.import_module(ext_name)
+                for attr in dir(mod):
+                    if not attr.startswith("_"):
+                        self._funcs[attr] = getattr(mod, attr)
+            except ImportError:
+                pass
+        self.__dict__.update(self._funcs)
+
+    def __getattr__(self, name: str):
+        if name.startswith("_"):
+            raise AttributeError(name)
+        self._load()
+        try:
+            return self._funcs[name]
+        except KeyError:
+            raise AttributeError(f"module 'cula.cudac' has no attribute '{name}'") from None
+
+    def __dir__(self):
+        self._load()
+        return list(self._funcs.keys())
+
+
+sys.modules[__name__] = _CudacProxy()
diff --git a/pyproject.toml b/pyproject.toml
index ef1a531b..a6c34562 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -11,8 +11,8 @@ readme = "README.md"
 authors = [ { name = "cula contributors" } ]
 requires-python = ">=3.10"
 dependencies = [
-    "nvidia-cutlass-dsl>=4.4.2",
-    "apache-tvm-ffi>=0.1.9",
+    "nvidia-cutlass-dsl==4.4.2",
+    "apache-tvm-ffi==0.1.9",
 ]
 license = { text = "Apache-2.0" }
 
@@ -84,9 +84,6 @@ force-sort-within-sections = false
 "cula/kda/blackwell_fused_fwd.py" = ["F821"]
 
 [tool.setuptools_scm]
-# write generated version into package for runtime access
 write_to = "cula/_version.py"
-# add a date-based local suffix when needed
-local_scheme = "node-and-date"
-# fallback for non-git sources
+local_scheme = "no-local-version"
 fallback_version = "0.1.0"
diff --git a/scripts/build_wheel.sh b/scripts/build_wheel.sh
index 42b35665..79ac3305 100755
--- a/scripts/build_wheel.sh
+++ b/scripts/build_wheel.sh
@@ -18,10 +18,19 @@ cd "$REPO_ROOT"
 
 # Parse args
 ISOLATION_FLAG="--no-isolation"
-if [[ "${1:-}" == "--isolated" ]]; then
-    ISOLATION_FLAG=""
-    echo "[build_wheel] Using isolated build environment"
-else
+for arg in "$@"; do
+    case "$arg" in
+        --isolated)
+            ISOLATION_FLAG=""
+            echo "[build_wheel] Using isolated build environment"
+            ;;
+        --fat)
+            export CULA_BUILD_ALL_ARCHS=1
+            echo "[build_wheel] Fat binary: building for all SM architectures"
+            ;;
+    esac
+done
+if [[ "$ISOLATION_FLAG" == "--no-isolation" ]]; then
     echo "[build_wheel] Using current environment (--no-isolation)"
 fi
 
@@ -33,6 +42,7 @@ rm -rf dist build *.egg-info
 echo "[build_wheel] Python: $(python -V 2>&1)"
 echo "[build_wheel] torch:  $(python -c 'import torch; print(torch.__version__)' 2>/dev/null || echo 'not installed')"
 echo "[build_wheel] CUDA:   $(nvcc --version 2>/dev/null | grep 'release' | sed 's/.*release //' | sed 's/,.*//' || echo 'not found')"
+echo "[build_wheel] Fat binary: ${CULA_BUILD_ALL_ARCHS:-0}"
 
 # Build wheel
 echo "[build_wheel] Building wheel..."
diff --git a/setup.py b/setup.py
index f7b11b95..0e127e81 100644
--- a/setup.py
+++ b/setup.py
@@ -46,13 +46,15 @@ def detect_gpu_archs() -> tuple[bool, bool, bool]:
 def resolve_disable_flag(env_name: str, detected: bool) -> bool:
     """
     Resolve whether to disable a given SM target.
+    - If CULA_BUILD_ALL_ARCHS is set, all targets are enabled unconditionally.
     - If the environment variable is explicitly set, honour it.
     - Otherwise, disable the target when no matching GPU is detected.
     """
+    if os.getenv("CULA_BUILD_ALL_ARCHS", "0") == "1":
+        return False
     env_val = os.getenv(env_name)
     if env_val is not None:
         return env_val.lower() in ["true", "1", "y", "yes"]
-    # Auto-detect: disable if no matching device found
     disable = not detected
     if disable:
         print(f"  No matching GPU detected; auto-setting {env_name}=1 (disable). Set {env_name}=0 to override.")
@@ -66,7 +68,11 @@ def get_features_args():
 
 USE_FAST_MATH = os.getenv("CULA_USE_FAST_MATH", "1") == "1"
 
-print("Detecting GPU architectures...")
+if os.getenv("CULA_BUILD_ALL_ARCHS", "0") == "1":
+    print("CULA_BUILD_ALL_ARCHS=1: enabling all SM targets (sm90a, sm100a, sm103a)")
+else:
+    print("Detecting GPU architectures...")
+
 _has_sm100, _has_sm103, _has_sm90 = detect_gpu_archs()
 DISABLE_SM100 = resolve_disable_flag("CULA_DISABLE_SM100", _has_sm100)
 DISABLE_SM103 = resolve_disable_flag("CULA_DISABLE_SM103", _has_sm103)
@@ -111,26 +117,6 @@ def assert_blackwell_build_env() -> None:
         )
 
 
-def get_arch_flags():
-    major, minor = get_nvcc_version()
-    print(f"Compiling using NVCC {major}.{minor}")
-
-    # Validate Blackwell build environment
-    assert_blackwell_build_env()
-
-    arch_flags = []
-    if not DISABLE_SM100:
-        arch_flags.extend(["-gencode", "arch=compute_100a,code=sm_100a"])
-        arch_flags.extend(["-DCULA_SM100_ENABLED"])
-    if not DISABLE_SM103:
-        arch_flags.extend(["-gencode", "arch=compute_103a,code=sm_103a"])
-        arch_flags.extend(["-DCULA_SM103_ENABLED"])
-    if not DISABLE_SM90:
-        arch_flags.extend(["-gencode", "arch=compute_90a,code=sm_90a"])
-        arch_flags.extend(["-DCULA_SM90A_ENABLED"])
-    return arch_flags
-
-
 def get_nvcc_thread_args():
     nvcc_threads = os.getenv("NVCC_THREADS") or "32"
     return ["--threads", nvcc_threads]
@@ -145,61 +131,78 @@ def get_nvcc_thread_args():
 else:
     cxx_args = ["-O3", "-std=c++20", "-DNDEBUG", "-Wno-deprecated-declarations"]
 
-cuda_sources = [
-    "csrc/api/pybind.cu",
+nvcc_common_args = [
+    "-O3",
+    "-std=c++20",
+    "-DNDEBUG",
+    # "-D_USE_MATH_DEFINES",
+    "-Wno-deprecated-declarations",
+    "-U__CUDA_NO_HALF_OPERATORS__",
+    "-U__CUDA_NO_HALF_CONVERSIONS__",
+    "-U__CUDA_NO_HALF2_OPERATORS__",
+    "-U__CUDA_NO_BFLOAT16_CONVERSIONS__",
+    "--expt-relaxed-constexpr",
+    "--expt-extended-lambda",
+    "-lineinfo",
+    "--ptxas-options=--verbose,--register-usage-level=10,--warn-on-local-memory-usage",
+    "-diag-suppress=3189",
 ]
+
+include_dirs = [
+    Path(this_dir) / "csrc",
+    Path(this_dir) / "csrc" / "kerutils" / "include",
+    Path(this_dir) / "csrc" / "cutlass" / "include",
+    Path(this_dir) / "csrc" / "cutlass" / "tools" / "util" / "include",
+]
+
+major, minor = get_nvcc_version()
+print(f"Compiling using NVCC {major}.{minor}")
+assert_blackwell_build_env()
+
+ext_modules = []
+
 if not DISABLE_SM100 or not DISABLE_SM103:
-    cuda_sources.extend(
-        [
-            "csrc/api/kda_sm100.cu",
-            "csrc/kda/sm100/kda_fwd_sm100.cu",
-        ]
-    )
-if not DISABLE_SM90:
-    cuda_sources.extend(
-        [
-            "csrc/api/kda_sm90.cu",
-            "csrc/kda/sm90/kda_fwd_sm90.cu",
-            "csrc/kda/sm90/kda_fwd_sm90_safe_gate.cu",
-        ]
+    sm100_arch_flags = []
+    if not DISABLE_SM100:
+        sm100_arch_flags.extend(["-gencode", "arch=compute_100a,code=sm_100a", "-DCULA_SM100_ENABLED"])
+    if not DISABLE_SM103:
+        sm100_arch_flags.extend(["-gencode", "arch=compute_103a,code=sm_103a", "-DCULA_SM103_ENABLED"])
+
+    ext_modules.append(
+        CUDAExtension(
+            name="cula._cudac_sm100",
+            sources=[
+                "csrc/api/kda_sm100.cu",
+                "csrc/kda/sm100/kda_fwd_sm100.cu",
+            ],
+            extra_compile_args={
+                "cxx": cxx_args + get_features_args(),
+                "nvcc": nvcc_common_args + get_features_args() + sm100_arch_flags
+                + get_nvcc_thread_args() + (["--use_fast_math"] if USE_FAST_MATH else []),
+            },
+            include_dirs=include_dirs,
+        )
     )
 
-ext_modules = []
-ext_modules.append(
-    CUDAExtension(
-        name="cula.cudac",
-        sources=cuda_sources,
-        extra_compile_args={
-            "cxx": cxx_args + get_features_args(),
-            "nvcc": [
-                "-O3",
-                "-std=c++20",
-                "-DNDEBUG",
-                # "-D_USE_MATH_DEFINES",
-                "-Wno-deprecated-declarations",
-                "-U__CUDA_NO_HALF_OPERATORS__",
-                "-U__CUDA_NO_HALF_CONVERSIONS__",
-                "-U__CUDA_NO_HALF2_OPERATORS__",
-                "-U__CUDA_NO_BFLOAT16_CONVERSIONS__",
-                "--expt-relaxed-constexpr",
-                "--expt-extended-lambda",
-                "-lineinfo",
-                "--ptxas-options=--verbose,--register-usage-level=10,--warn-on-local-memory-usage",
-                "-diag-suppress=3189",  # suppress the warning of torch in C++ 20
-            ]
-            + get_features_args()
-            + get_arch_flags()
-            + get_nvcc_thread_args()
-            + (["--use_fast_math"] if USE_FAST_MATH else []),
-        },
-        include_dirs=[
-            Path(this_dir) / "csrc",
-            Path(this_dir) / "csrc" / "kerutils" / "include",
-            Path(this_dir) / "csrc" / "cutlass" / "include",
-            Path(this_dir) / "csrc" / "cutlass" / "tools" / "util" / "include",
-        ],
+if not DISABLE_SM90:
+    sm90_arch_flags = ["-gencode", "arch=compute_90a,code=sm_90a", "-DCULA_SM90A_ENABLED"]
+
+    ext_modules.append(
+        CUDAExtension(
+            name="cula._cudac_sm90",
+            sources=[
+                "csrc/api/kda_sm90.cu",
+                "csrc/kda/sm90/kda_fwd_sm90.cu",
+                "csrc/kda/sm90/kda_fwd_sm90_safe_gate.cu",
+            ],
+            extra_compile_args={
+                "cxx": cxx_args + get_features_args(),
+                "nvcc": nvcc_common_args + get_features_args() + sm90_arch_flags
+                + get_nvcc_thread_args() + (["--use_fast_math"] if USE_FAST_MATH else []),
+            },
+            include_dirs=include_dirs,
+        )
     )
-)
 
 setup(
     name="cuda-linear-attention",

From 761673708d326bffe3d102aff3e09610e897e088 Mon Sep 17 00:00:00 2001
From: tongke <shunde.csd@antgroup.com>
Date: Fri, 12 Jun 2026 11:38:28 +0800
Subject: [PATCH 02/14] fix ruff lint errors

---
 setup.py          | 14 ++++++++++----
 tests/conftest.py |  7 ++-----
 2 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/setup.py b/setup.py
index 0e127e81..3df211b5 100644
--- a/setup.py
+++ b/setup.py
@@ -177,8 +177,11 @@ def get_nvcc_thread_args():
             ],
             extra_compile_args={
                 "cxx": cxx_args + get_features_args(),
-                "nvcc": nvcc_common_args + get_features_args() + sm100_arch_flags
-                + get_nvcc_thread_args() + (["--use_fast_math"] if USE_FAST_MATH else []),
+                "nvcc": nvcc_common_args
+                + get_features_args()
+                + sm100_arch_flags
+                + get_nvcc_thread_args()
+                + (["--use_fast_math"] if USE_FAST_MATH else []),
             },
             include_dirs=include_dirs,
         )
@@ -197,8 +200,11 @@ def get_nvcc_thread_args():
             ],
             extra_compile_args={
                 "cxx": cxx_args + get_features_args(),
-                "nvcc": nvcc_common_args + get_features_args() + sm90_arch_flags
-                + get_nvcc_thread_args() + (["--use_fast_math"] if USE_FAST_MATH else []),
+                "nvcc": nvcc_common_args
+                + get_features_args()
+                + sm90_arch_flags
+                + get_nvcc_thread_args()
+                + (["--use_fast_math"] if USE_FAST_MATH else []),
             },
             include_dirs=include_dirs,
         )
diff --git a/tests/conftest.py b/tests/conftest.py
index f144c10b..a9338aca 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,4 +1,5 @@
 import re
+
 import pytest
 import torch
 
@@ -56,9 +57,5 @@ def pytest_collection_modifyitems(config, items):
             item.add_marker(skip_slow)
             continue
         callspec = getattr(item, "callspec", None)
-        if (
-            callspec is not None
-            and callspec.params.get("disable_recompute")
-            and "kda_fast_norecomp" not in item.keywords
-        ):
+        if callspec is not None and callspec.params.get("disable_recompute") and "kda_fast_norecomp" not in item.keywords:
             item.add_marker(skip_fast_norecomp)

From b148c526d513062da686aa5c00c30c67590bca16 Mon Sep 17 00:00:00 2001
From: tongke <shunde.csd@antgroup.com>
Date: Fri, 12 Jun 2026 11:52:37 +0800
Subject: [PATCH 03/14] revert version requirements changes

---
 pyproject.toml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index a6c34562..fe93e562 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -11,8 +11,8 @@ readme = "README.md"
 authors = [ { name = "cula contributors" } ]
 requires-python = ">=3.10"
 dependencies = [
-    "nvidia-cutlass-dsl==4.4.2",
-    "apache-tvm-ffi==0.1.9",
+    "nvidia-cutlass-dsl>=4.4.2",
+    "apache-tvm-ffi>=0.1.9",
 ]
 license = { text = "Apache-2.0" }
 

From 2c56304dc128cb8935238a7566e91528cd9c296e Mon Sep 17 00:00:00 2001
From: tongke <shunde.csd@antgroup.com>
Date: Fri, 12 Jun 2026 12:02:35 +0800
Subject: [PATCH 04/14] Make cudac proxy thread-safe and raise on missing
 extensions

Add double-checked locking to _CudacProxy._load() to prevent race
conditions in multi-threaded environments. Raise a descriptive
ImportError when no CUDA extensions can be loaded instead of silently
producing AttributeError later.
---
 cula/cudac.py | 32 ++++++++++++++++++++++----------
 1 file changed, 22 insertions(+), 10 deletions(-)

diff --git a/cula/cudac.py b/cula/cudac.py
index 287d9747..c256f6ef 100644
--- a/cula/cudac.py
+++ b/cula/cudac.py
@@ -22,6 +22,7 @@
 
 import importlib
 import sys
+import threading
 from types import ModuleType
 
 
@@ -33,20 +34,31 @@ def __init__(self):
         self.__path__ = []
         self._modules_loaded = False
         self._funcs: dict[str, object] = {}
+        self._lock = threading.Lock()
 
     def _load(self):
         if self._modules_loaded:
             return
-        self._modules_loaded = True
-        for ext_name in ("cula._cudac_sm100", "cula._cudac_sm90"):
-            try:
-                mod = importlib.import_module(ext_name)
-                for attr in dir(mod):
-                    if not attr.startswith("_"):
-                        self._funcs[attr] = getattr(mod, attr)
-            except ImportError:
-                pass
-        self.__dict__.update(self._funcs)
+        with self._lock:
+            if self._modules_loaded:
+                return
+            loaded_any = False
+            for ext_name in ("cula._cudac_sm100", "cula._cudac_sm90"):
+                try:
+                    mod = importlib.import_module(ext_name)
+                    for attr in dir(mod):
+                        if not attr.startswith("_"):
+                            self._funcs[attr] = getattr(mod, attr)
+                    loaded_any = True
+                except ImportError:
+                    pass
+            if not loaded_any:
+                raise ImportError(
+                    "None of the cuLA CUDA extensions ('cula._cudac_sm100', 'cula._cudac_sm90') "
+                    "could be imported. Please make sure cuLA is compiled correctly."
+                )
+            self.__dict__.update(self._funcs)
+            self._modules_loaded = True
 
     def __getattr__(self, name: str):
         if name.startswith("_"):

From c955d4762867c7986d58a138edac187fd803e0aa Mon Sep 17 00:00:00 2001
From: yz262713 <yz262713@antgroup.com>
Date: Wed, 17 Jun 2026 19:36:51 +0800
Subject: [PATCH 05/14] Surface per-extension import errors in cudac proxy

The blanket `except ImportError: pass` swallowed the actual failure
reason, making it impossible to diagnose missing shared libraries or
build issues. Collect each extension's ImportError and include them
in the raised message.
---
 cula/cudac.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/cula/cudac.py b/cula/cudac.py
index c256f6ef..8dfcc49b 100644
--- a/cula/cudac.py
+++ b/cula/cudac.py
@@ -43,6 +43,7 @@ def _load(self):
             if self._modules_loaded:
                 return
             loaded_any = False
+            errors: dict[str, Exception] = {}
             for ext_name in ("cula._cudac_sm100", "cula._cudac_sm90"):
                 try:
                     mod = importlib.import_module(ext_name)
@@ -50,12 +51,14 @@ def _load(self):
                         if not attr.startswith("_"):
                             self._funcs[attr] = getattr(mod, attr)
                     loaded_any = True
-                except ImportError:
-                    pass
+                except ImportError as exc:
+                    errors[ext_name] = exc
             if not loaded_any:
+                details = "; ".join(f"{name}: {exc}" for name, exc in errors.items())
                 raise ImportError(
-                    "None of the cuLA CUDA extensions ('cula._cudac_sm100', 'cula._cudac_sm90') "
-                    "could be imported. Please make sure cuLA is compiled correctly."
+                    "None of the cuLA CUDA extensions could be imported. "
+                    f"Per-extension errors: [{details}]. "
+                    "Please make sure cuLA is compiled correctly."
                 )
             self.__dict__.update(self._funcs)
             self._modules_loaded = True

From 6ccb8fac78c7ade3b16ed9f30f1734f55b0b0a8f Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 17 Jun 2026 14:47:35 +0000
Subject: [PATCH 06/14] Fix build-release matrix with DRY expression mapping

---
 .github/workflows/build-release.yml | 17 +++--------------
 1 file changed, 3 insertions(+), 14 deletions(-)

diff --git a/.github/workflows/build-release.yml b/.github/workflows/build-release.yml
index cd5b54d0..ae15a184 100644
--- a/.github/workflows/build-release.yml
+++ b/.github/workflows/build-release.yml
@@ -13,7 +13,7 @@ concurrency:
 jobs:
   build-wheel:
     name: "wheel / ${{ matrix.cuda }} / cp312 / ${{ matrix.arch }}"
-    runs-on: ${{ matrix.runner }}
+    runs-on: ${{ matrix.arch == 'aarch64' && 'ubuntu-24.04-arm' || 'ubuntu-latest' }}
     strategy:
       fail-fast: false
       matrix:
@@ -23,19 +23,8 @@ jobs:
         arch:
           - x86_64
           - aarch64
-        include:
-          - cuda: cu129
-            cuda_version: "12.9.0"
-            torch_index: "https://download.pytorch.org/whl/cu129"
-          - cuda: cu130
-            cuda_version: "13.0.0"
-            torch_index: "https://download.pytorch.org/whl/cu130"
-          - arch: x86_64
-            runner: ubuntu-latest
-          - arch: aarch64
-            runner: ubuntu-24.04-arm
     container:
-      image: "nvidia/cuda:${{ matrix.cuda_version }}-devel-ubuntu24.04"
+      image: "nvidia/cuda:${{ matrix.cuda == 'cu129' && '12.9.0' || '13.0.0' }}-devel-ubuntu24.04"
 
     steps:
       - name: Free disk space
@@ -67,7 +56,7 @@ jobs:
       - name: Install Python dependencies
         run: |
           python -m pip install --no-cache-dir --upgrade pip
-          python -m pip install --no-cache-dir torch --index-url ${{ matrix.torch_index }}
+          python -m pip install --no-cache-dir torch --index-url ${{ matrix.cuda == 'cu129' && 'https://download.pytorch.org/whl/cu129' || 'https://download.pytorch.org/whl/cu130' }}
           python -m pip install --no-cache-dir setuptools wheel "setuptools_scm>=6.0" build ninja
 
       - name: Compute version

From f700810b816c79af0c56351711940d3fc7d8c292 Mon Sep 17 00:00:00 2001
From: tongke <124763920+tongke6@users.noreply.github.com>
Date: Wed, 17 Jun 2026 22:51:28 +0800
Subject: [PATCH 07/14] Apply suggestions from code review

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
---
 cula/cudac.py | 4 +++-
 setup.py      | 4 ++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/cula/cudac.py b/cula/cudac.py
index 8dfcc49b..1bbaf108 100644
--- a/cula/cudac.py
+++ b/cula/cudac.py
@@ -77,4 +77,6 @@ def __dir__(self):
         return list(self._funcs.keys())
 
 
-sys.modules[__name__] = _CudacProxy()
+_proxy = _CudacProxy()
+_proxy.__dict__.update({k: globals().get(k) for k in ("__spec__", "__file__", "__package__", "__loader__")})
+sys.modules[__name__] = _proxy
diff --git a/setup.py b/setup.py
index 3df211b5..78c61e5c 100644
--- a/setup.py
+++ b/setup.py
@@ -164,9 +164,9 @@ def get_nvcc_thread_args():
 if not DISABLE_SM100 or not DISABLE_SM103:
     sm100_arch_flags = []
     if not DISABLE_SM100:
-        sm100_arch_flags.extend(["-gencode", "arch=compute_100a,code=sm_100a", "-DCULA_SM100_ENABLED"])
+        sm100_arch_flags.extend(["-gencode", "arch=compute_100a,code=sm_100a"])
     if not DISABLE_SM103:
-        sm100_arch_flags.extend(["-gencode", "arch=compute_103a,code=sm_103a", "-DCULA_SM103_ENABLED"])
+        sm100_arch_flags.extend(["-gencode", "arch=compute_103a,code=sm_103a"])
 
     ext_modules.append(
         CUDAExtension(

From e0a1e214179fd5e53dcfe5bb2e114909e104e91b Mon Sep 17 00:00:00 2001
From: tongke <124763920+tongke6@users.noreply.github.com>
Date: Wed, 17 Jun 2026 23:06:57 +0800
Subject: [PATCH 08/14] Pin version and CUDA tag in README install command

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
---
 README.md | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 16e8e7b6..eabf435b 100644
--- a/README.md
+++ b/README.md
@@ -28,11 +28,9 @@ cuLA supports both **Hopper (SM90)** and **Blackwell (SM10X)** GPUs.
 
 Pre-built fat-binary wheels (SM90 + SM100 + SM103) are available on [GitHub Releases](https://github.com/inclusionAI/cuLA/releases):
 
-```bash
-pip install cuda-linear-attention -f https://github.com/inclusionAI/cuLA/releases/expanded_assets/<TAG>
-```
+    pip install "cuda-linear-attention==<VERSION>+<CUDA_TAG>" -f https://github.com/inclusionAI/cuLA/releases/expanded_assets/<TAG>
 
-Replace `<TAG>` with the release version (e.g., `v0.2.0`). `pip` will select the wheel matching your local CUDA version. Or download the `.whl` file directly from the [Releases page](https://github.com/inclusionAI/cuLA/releases) and install it with `pip install <filename>.whl`.
+Replace `<TAG>` with the release tag (e.g., `v0.2.0`), `<VERSION>` with the base version (e.g., `0.2.0`), and `<CUDA_TAG>` with your PyTorch CUDA build tag (e.g., `cu129` or `cu130`). Or download the `.whl` file directly from the [Releases page](https://github.com/inclusionAI/cuLA/releases) and install it with `pip install <filename>.whl`.
 
 ### Build from Source
 

From ead4f7eb5204194854f8f3a7dbddca03a24dedbb Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 17 Jun 2026 15:14:59 +0000
Subject: [PATCH 09/14] Add README example for building fat wheels

---
 README.md | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/README.md b/README.md
index eabf435b..7502e302 100644
--- a/README.md
+++ b/README.md
@@ -57,6 +57,12 @@ pip install -e third_party/flash-linear-attention
 pip install -e . --no-build-isolation
 ```
 
+**Build fat wheel (SM90 + SM100 + SM103):**
+
+```bash
+CULA_BUILD_ALL_ARCHS=1 python -m build --wheel --no-isolation
+```
+
 ## Quick Start
 
 ### KDA (Kimi Delta Attention) — Blackwell (SM10X)

From 75f171066e94be3e42f4c59c0745e762a691f704 Mon Sep 17 00:00:00 2001
From: tongke <shunde.csd@antgroup.com>
Date: Mon, 22 Jun 2026 19:13:26 +0800
Subject: [PATCH 10/14] Surface partial-extension failures in cudac proxy

Catch (ImportError, AttributeError, OSError) when scanning per-arch
extensions: pybind11 modules commonly surface missing-symbol / ABI /
libcudart failures as AttributeError or OSError rather than
ImportError, so the prior narrow catch silently dropped one extension's
failure when another succeeded, leaving its kernels missing without
diagnostic. Emit a UserWarning naming each failing extension on
partial failure (all-fail still raises ImportError), preserving the
c955d47 intent of surfacing per-extension errors. Also document the
load-once-per-process semantics in the module docstring.
---
 cula/cudac.py | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/cula/cudac.py b/cula/cudac.py
index 1bbaf108..23f96c97 100644
--- a/cula/cudac.py
+++ b/cula/cudac.py
@@ -18,11 +18,19 @@
 and call ``cula_cuda.kda_fwd_prefill(...)`` or
 ``cula_cuda.chunk_kda_fwd_intra_cuda(...)`` without knowing which
 extension provides the function.
+
+Loading is **once per process**: the first attribute access triggers a
+single threaded scan of every built ``cula._cudac_sm*`` extension; the
+discovered callables are then cached on the module instance and no
+further re-scan happens. Installing or rebuilding an extension after a
+process has already imported ``cula.cudac`` will therefore not be picked
+up -- callers that need a freshly built extension must restart Python.
 """
 
 import importlib
 import sys
 import threading
+import warnings
 from types import ModuleType
 
 
@@ -44,6 +52,10 @@ def _load(self):
                 return
             loaded_any = False
             errors: dict[str, Exception] = {}
+            # pybind11 extensions surface missing-symbol / ABI / libcudart
+            # failures as AttributeError or OSError at import time rather
+            # than ImportError, so catch the broader set to keep matching
+            # the c955d47 intent of surfacing every per-extension failure.
             for ext_name in ("cula._cudac_sm100", "cula._cudac_sm90"):
                 try:
                     mod = importlib.import_module(ext_name)
@@ -51,7 +63,7 @@ def _load(self):
                         if not attr.startswith("_"):
                             self._funcs[attr] = getattr(mod, attr)
                     loaded_any = True
-                except ImportError as exc:
+                except (ImportError, AttributeError, OSError) as exc:
                     errors[ext_name] = exc
             if not loaded_any:
                 details = "; ".join(f"{name}: {exc}" for name, exc in errors.items())
@@ -60,6 +72,16 @@ def _load(self):
                     f"Per-extension errors: [{details}]. "
                     "Please make sure cuLA is compiled correctly."
                 )
+            # Partial failures are not fatal (each surviving extension is
+            # usable), but the user still needs to know which kernel sets
+            # are missing so they can diagnose a partial / mismatched build.
+            if errors:
+                details = "; ".join(f"{name}: {exc}" for name, exc in errors.items())
+                warnings.warn(
+                    "Some cuLA CUDA extensions could not be imported and their "
+                    f"kernels are unavailable. Per-extension errors: [{details}].",
+                    stacklevel=2,
+                )
             self.__dict__.update(self._funcs)
             self._modules_loaded = True
 

From ceab48b5f83866bb869bda0864bd7a1580557497 Mon Sep 17 00:00:00 2001
From: tongke <shunde.csd@antgroup.com>
Date: Mon, 22 Jun 2026 20:00:03 +0800
Subject: [PATCH 11/14] Build release wheels against manylinux_2_28

---
 .github/workflows/build-release.yml | 29 +++++++++++++++++++++++------
 README.md                           |  4 ++--
 2 files changed, 25 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/build-release.yml b/.github/workflows/build-release.yml
index ae15a184..e2cdd5cf 100644
--- a/.github/workflows/build-release.yml
+++ b/.github/workflows/build-release.yml
@@ -24,20 +24,21 @@ jobs:
           - x86_64
           - aarch64
     container:
-      image: "nvidia/cuda:${{ matrix.cuda == 'cu129' && '12.9.0' || '13.0.0' }}-devel-ubuntu24.04"
+      # UBI 8 provides the glibc 2.28 baseline required by manylinux_2_28.
+      image: "nvidia/cuda:${{ matrix.cuda == 'cu129' && '12.9.0' || '13.0.0' }}-devel-ubi8"
 
     steps:
       - name: Free disk space
         run: |
           rm -rf /opt/hostedtoolcache /usr/local/lib/android /usr/share/dotnet \
                  /usr/local/share/boost /opt/ghc 2>/dev/null || true
-          apt-get clean 2>/dev/null || true
+          dnf clean all 2>/dev/null || true
           df -h / || true
 
       - name: Install git
         run: |
-          apt-get update && apt-get install -y --no-install-recommends git \
-            && rm -rf /var/lib/apt/lists/*
+          dnf install -y git
+          dnf clean all
 
       - name: Checkout
         uses: actions/checkout@v5
@@ -57,7 +58,7 @@ jobs:
         run: |
           python -m pip install --no-cache-dir --upgrade pip
           python -m pip install --no-cache-dir torch --index-url ${{ matrix.cuda == 'cu129' && 'https://download.pytorch.org/whl/cu129' || 'https://download.pytorch.org/whl/cu130' }}
-          python -m pip install --no-cache-dir setuptools wheel "setuptools_scm>=6.0" build ninja
+          python -m pip install --no-cache-dir setuptools wheel "setuptools_scm>=6.0" build ninja auditwheel patchelf
 
       - name: Compute version
         id: version
@@ -76,7 +77,20 @@ jobs:
           SETUPTOOLS_SCM_PRETEND_VERSION: "${{ steps.version.outputs.version }}"
           NVCC_THREADS: "4"
           MAX_JOBS: "4"
-        run: python -m build --wheel --no-isolation
+        run: python -m build --wheel --no-isolation --outdir dist-raw
+
+      - name: Repair wheel for manylinux_2_28
+        run: |
+          # These libraries are supplied by the NVIDIA driver, PyTorch, or
+          # PyTorch's CUDA runtime dependency and must remain external.
+          python -m auditwheel repair \
+            --plat manylinux_2_28_${{ matrix.arch }} \
+            --exclude libcuda.so.1 \
+            --exclude 'libcudart.so.*' \
+            --exclude 'libc10*.so' \
+            --exclude 'libtorch*.so' \
+            --wheel-dir dist \
+            dist-raw/*.whl
 
       - name: Verify wheel
         run: |
@@ -84,6 +98,9 @@ jobs:
           ls -lh dist/*.whl
           ls dist/*.whl | grep -q "+${{ matrix.cuda }}" \
             || { echo "ERROR: wheel name missing +${{ matrix.cuda }} suffix"; exit 1; }
+          ls dist/*.whl | grep -q "manylinux_2_28_${{ matrix.arch }}" \
+            || { echo "ERROR: wheel is not tagged manylinux_2_28_${{ matrix.arch }}"; exit 1; }
+          python -m auditwheel show dist/*.whl
 
       - name: Upload wheel artifact
         uses: actions/upload-artifact@v6
diff --git a/README.md b/README.md
index 7502e302..09418811 100644
--- a/README.md
+++ b/README.md
@@ -26,7 +26,7 @@ cuLA supports both **Hopper (SM90)** and **Blackwell (SM10X)** GPUs.
 
 ### Pre-built Wheels
 
-Pre-built fat-binary wheels (SM90 + SM100 + SM103) are available on [GitHub Releases](https://github.com/inclusionAI/cuLA/releases):
+Pre-built fat-binary wheels (SM90 + SM100 + SM103) are available on [GitHub Releases](https://github.com/inclusionAI/cuLA/releases). Linux wheels target `manylinux_2_28` and require glibc 2.28 or newer:
 
     pip install "cuda-linear-attention==<VERSION>+<CUDA_TAG>" -f https://github.com/inclusionAI/cuLA/releases/expanded_assets/<TAG>
 
@@ -255,4 +255,4 @@ No CUDA experience is required as long as you're a quick learner.
 For Q&A and discussion, you can join us through:
 
 - **Slack:** [cuLA Slack Community](https://join.slack.com/t/cula-hq/shared_invite/zt-3uaacvm9y-xJwZyGueeKxZRYQlj7~hxw)
-- **WeChat:** The WeChat group has exceeded 200 members and can no longer be joined via QR code. To join, please send your WeChat ID to any of the following emails and we'll invite you: **chaofanyu@gmail.com** / **kevinzz08@foxmail.com** / **yzpag@gmail.com** / **haoc80996@gmail.com**. You can also ask someone already in the group to invite you directly.
\ No newline at end of file
+- **WeChat:** The WeChat group has exceeded 200 members and can no longer be joined via QR code. To join, please send your WeChat ID to any of the following emails and we'll invite you: **chaofanyu@gmail.com** / **kevinzz08@foxmail.com** / **yzpag@gmail.com** / **haoc80996@gmail.com**. You can also ask someone already in the group to invite you directly.

From 430531b272cae3b112c5eec5711e5975a0aa4617 Mon Sep 17 00:00:00 2001
From: tongke <shunde.csd@antgroup.com>
Date: Mon, 22 Jun 2026 20:35:57 +0800
Subject: [PATCH 12/14] Fix Python 3.12 glibc compatibility problems on UBI 8

---
 .github/workflows/build-release.yml | 23 +++++++++--------------
 1 file changed, 9 insertions(+), 14 deletions(-)

diff --git a/.github/workflows/build-release.yml b/.github/workflows/build-release.yml
index e2cdd5cf..88686f0f 100644
--- a/.github/workflows/build-release.yml
+++ b/.github/workflows/build-release.yml
@@ -35,9 +35,9 @@ jobs:
           dnf clean all 2>/dev/null || true
           df -h / || true
 
-      - name: Install git
+      - name: Install system dependencies
         run: |
-          dnf install -y git
+          dnf install -y git python3.12 python3.12-devel python3.12-pip
           dnf clean all
 
       - name: Checkout
@@ -49,16 +49,11 @@ jobs:
       - name: Configure git safe directory
         run: git config --global --add safe.directory "$GITHUB_WORKSPACE"
 
-      - name: Set up Python
-        uses: actions/setup-python@v6
-        with:
-          python-version: "3.12"
-
       - name: Install Python dependencies
         run: |
-          python -m pip install --no-cache-dir --upgrade pip
-          python -m pip install --no-cache-dir torch --index-url ${{ matrix.cuda == 'cu129' && 'https://download.pytorch.org/whl/cu129' || 'https://download.pytorch.org/whl/cu130' }}
-          python -m pip install --no-cache-dir setuptools wheel "setuptools_scm>=6.0" build ninja auditwheel patchelf
+          python3.12 -m pip install --no-cache-dir --upgrade pip
+          python3.12 -m pip install --no-cache-dir torch --index-url ${{ matrix.cuda == 'cu129' && 'https://download.pytorch.org/whl/cu129' || 'https://download.pytorch.org/whl/cu130' }}
+          python3.12 -m pip install --no-cache-dir setuptools wheel "setuptools_scm>=6.0" build ninja auditwheel patchelf
 
       - name: Compute version
         id: version
@@ -67,7 +62,7 @@ jobs:
             BASE="${GITHUB_REF#refs/tags/v}"
           else
             # Strip any local segment (+gXXX) so we get a clean base
-            BASE=$(python -c "from setuptools_scm import get_version; print(get_version().split('+')[0])")
+            BASE=$(python3.12 -c "from setuptools_scm import get_version; print(get_version().split('+')[0])")
           fi
           echo "version=${BASE}+${{ matrix.cuda }}" >> "$GITHUB_OUTPUT"
 
@@ -77,13 +72,13 @@ jobs:
           SETUPTOOLS_SCM_PRETEND_VERSION: "${{ steps.version.outputs.version }}"
           NVCC_THREADS: "4"
           MAX_JOBS: "4"
-        run: python -m build --wheel --no-isolation --outdir dist-raw
+        run: python3.12 -m build --wheel --no-isolation --outdir dist-raw
 
       - name: Repair wheel for manylinux_2_28
         run: |
           # These libraries are supplied by the NVIDIA driver, PyTorch, or
           # PyTorch's CUDA runtime dependency and must remain external.
-          python -m auditwheel repair \
+          python3.12 -m auditwheel repair \
             --plat manylinux_2_28_${{ matrix.arch }} \
             --exclude libcuda.so.1 \
             --exclude 'libcudart.so.*' \
@@ -100,7 +95,7 @@ jobs:
             || { echo "ERROR: wheel name missing +${{ matrix.cuda }} suffix"; exit 1; }
           ls dist/*.whl | grep -q "manylinux_2_28_${{ matrix.arch }}" \
             || { echo "ERROR: wheel is not tagged manylinux_2_28_${{ matrix.arch }}"; exit 1; }
-          python -m auditwheel show dist/*.whl
+          python3.12 -m auditwheel show dist/*.whl
 
       - name: Upload wheel artifact
         uses: actions/upload-artifact@v6

From a27443ce17d0a5eed710ae82696446a730d04b9c Mon Sep 17 00:00:00 2001
From: tongke <shunde.csd@antgroup.com>
Date: Mon, 22 Jun 2026 20:47:40 +0800
Subject: [PATCH 13/14] Build wheels with gcc-toolset-13 as host compiler

---
 .github/workflows/build-release.yml | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/build-release.yml b/.github/workflows/build-release.yml
index 88686f0f..72e593c5 100644
--- a/.github/workflows/build-release.yml
+++ b/.github/workflows/build-release.yml
@@ -14,6 +14,9 @@ jobs:
   build-wheel:
     name: "wheel / ${{ matrix.cuda }} / cp312 / ${{ matrix.arch }}"
     runs-on: ${{ matrix.arch == 'aarch64' && 'ubuntu-24.04-arm' || 'ubuntu-latest' }}
+    defaults:
+      run:
+        shell: bash
     strategy:
       fail-fast: false
       matrix:
@@ -37,7 +40,13 @@ jobs:
 
       - name: Install system dependencies
         run: |
-          dnf install -y git python3.12 python3.12-devel python3.12-pip
+          dnf install -y \
+            git \
+            gcc-toolset-13-gcc \
+            gcc-toolset-13-gcc-c++ \
+            python3.12 \
+            python3.12-devel \
+            python3.12-pip
           dnf clean all
 
       - name: Checkout
@@ -68,11 +77,17 @@ jobs:
 
       - name: Build fat-binary wheel
         env:
+          CC: /opt/rh/gcc-toolset-13/root/usr/bin/gcc
+          CXX: /opt/rh/gcc-toolset-13/root/usr/bin/g++
+          CUDAHOSTCXX: /opt/rh/gcc-toolset-13/root/usr/bin/g++
           CULA_BUILD_ALL_ARCHS: "1"
           SETUPTOOLS_SCM_PRETEND_VERSION: "${{ steps.version.outputs.version }}"
           NVCC_THREADS: "4"
           MAX_JOBS: "4"
-        run: python3.12 -m build --wheel --no-isolation --outdir dist-raw
+        run: |
+          "$CC" --version
+          "$CXX" --version
+          python3.12 -m build --wheel --no-isolation --outdir dist-raw
 
       - name: Repair wheel for manylinux_2_28
         run: |

From 44ab4013d9e7d14d12e1a29852e78e0b1ab9c08a Mon Sep 17 00:00:00 2001
From: tongke <shunde.csd@antgroup.com>
Date: Mon, 22 Jun 2026 22:52:36 +0800
Subject: [PATCH 14/14] Load CUDA extension matching current GPU architecture

Select the per-architecture CUDA extension from the active device compute
capability instead of scanning every built extension. SM100/SM103 now load
the SM100 extension, while SM90 loads the SM90 extension.

This avoids exposing kernels from mismatched GPU architectures and reports
clearer errors when the matching extension is missing or unsupported.
---
 cula/cudac.py | 74 +++++++++++++++++++++++++--------------------------
 1 file changed, 36 insertions(+), 38 deletions(-)

diff --git a/cula/cudac.py b/cula/cudac.py
index 23f96c97..28fb5f38 100644
--- a/cula/cudac.py
+++ b/cula/cudac.py
@@ -19,23 +19,41 @@
 ``cula_cuda.chunk_kda_fwd_intra_cuda(...)`` without knowing which
 extension provides the function.
 
-Loading is **once per process**: the first attribute access triggers a
-single threaded scan of every built ``cula._cudac_sm*`` extension; the
-discovered callables are then cached on the module instance and no
-further re-scan happens. Installing or rebuilding an extension after a
-process has already imported ``cula.cudac`` will therefore not be picked
-up -- callers that need a freshly built extension must restart Python.
+Loading is **once per process**: the first attribute access checks the
+currently active CUDA device, imports the matching ``cula._cudac_sm*``
+extension, and caches the discovered callables on the module instance.
+Changing the active CUDA device to a different architecture after a
+process has already loaded ``cula.cudac`` will therefore not be picked
+up -- callers that need a different extension must restart Python.
 """
 
 import importlib
 import sys
 import threading
-import warnings
 from types import ModuleType
 
 
+def _current_device_extension() -> tuple[str, str]:
+    try:
+        import torch
+    except ImportError as exc:
+        raise ImportError("cuLA CUDA extensions require PyTorch to detect the current GPU.") from exc
+
+    if not torch.cuda.is_available():
+        raise RuntimeError("cuLA CUDA extensions require a visible CUDA GPU, but torch.cuda.is_available() is False.")
+
+    device = torch.cuda.current_device()
+    prop = torch.cuda.get_device_properties(device)
+    sm_label = f"sm_{prop.major}{prop.minor}"
+    if prop.major == 10 and prop.minor in (0, 3):
+        return "cula._cudac_sm100", sm_label
+    if prop.major == 9 and prop.minor == 0:
+        return "cula._cudac_sm90", sm_label
+    raise RuntimeError(f"Unsupported CUDA compute capability {sm_label}. Supported architectures: sm_100, sm_103, sm_90.")
+
+
 class _CudacProxy(ModuleType):
-    """Lazy proxy that exposes functions from all built arch extensions."""
+    """Lazy proxy that exposes functions from the current GPU arch extension."""
 
     def __init__(self):
         super().__init__(__name__)
@@ -50,38 +68,18 @@ def _load(self):
         with self._lock:
             if self._modules_loaded:
                 return
-            loaded_any = False
-            errors: dict[str, Exception] = {}
-            # pybind11 extensions surface missing-symbol / ABI / libcudart
-            # failures as AttributeError or OSError at import time rather
-            # than ImportError, so catch the broader set to keep matching
-            # the c955d47 intent of surfacing every per-extension failure.
-            for ext_name in ("cula._cudac_sm100", "cula._cudac_sm90"):
-                try:
-                    mod = importlib.import_module(ext_name)
-                    for attr in dir(mod):
-                        if not attr.startswith("_"):
-                            self._funcs[attr] = getattr(mod, attr)
-                    loaded_any = True
-                except (ImportError, AttributeError, OSError) as exc:
-                    errors[ext_name] = exc
-            if not loaded_any:
-                details = "; ".join(f"{name}: {exc}" for name, exc in errors.items())
+            ext_name, sm_label = _current_device_extension()
+            try:
+                mod = importlib.import_module(ext_name)
+                for attr in dir(mod):
+                    if not attr.startswith("_"):
+                        self._funcs[attr] = getattr(mod, attr)
+            except (ImportError, AttributeError, OSError) as exc:
                 raise ImportError(
-                    "None of the cuLA CUDA extensions could be imported. "
-                    f"Per-extension errors: [{details}]. "
+                    f"The cuLA CUDA extension for the current GPU ({sm_label}) could not be imported. "
+                    f"Extension {ext_name} failed with: {exc}. "
                     "Please make sure cuLA is compiled correctly."
-                )
-            # Partial failures are not fatal (each surviving extension is
-            # usable), but the user still needs to know which kernel sets
-            # are missing so they can diagnose a partial / mismatched build.
-            if errors:
-                details = "; ".join(f"{name}: {exc}" for name, exc in errors.items())
-                warnings.warn(
-                    "Some cuLA CUDA extensions could not be imported and their "
-                    f"kernels are unavailable. Per-extension errors: [{details}].",
-                    stacklevel=2,
-                )
+                ) from exc
             self.__dict__.update(self._funcs)
             self._modules_loaded = True