From 057b34acf48faf3b202a13fc73430c5cba652b7a Mon Sep 17 00:00:00 2001 From: tongke <124763920+tongke6@users.noreply.github.com> Date: Fri, 12 Jun 2026 11:27:33 +0800 Subject: [PATCH 01/14] Split CUDA extensions by SM architecture for fat-binary wheel builds (#83) Replace the monolithic `cula.cudac` extension with per-arch extensions (`cula._cudac_sm90`, `cula._cudac_sm100`) so that SM90 and SM100/SM103 kernels are compiled independently with their own `-gencode` flags. This enables building fat-binary wheels containing all architectures without needing the target GPU present at build time. Key changes: - Split pybind.cu into per-file PYBIND11_MODULE definitions - Add `cula/cudac.py` proxy module for backwards-compatible imports - Add `CULA_BUILD_ALL_ARCHS=1` env var to enable all SM targets - Add `--fat` flag to build_wheel.sh for CI fat-binary builds - Pin dependency versions and use `no-local-version` scheme for reproducible wheel filenames - Use setuptools_scm for dynamic `__version__` - Document pre-built wheel installation in README --- .github/workflows/build-release.yml | 125 +++++++++++++++++++++++ README.md | 12 +++ csrc/api/kda_sm100.cu | 6 ++ csrc/api/kda_sm90.cu | 5 + csrc/api/pybind.cu | 80 --------------- cula/__init__.py | 5 +- cula/cudac.py | 65 ++++++++++++ pyproject.toml | 9 +- scripts/build_wheel.sh | 18 +++- setup.py | 147 ++++++++++++++-------------- 10 files changed, 309 insertions(+), 163 deletions(-) create mode 100644 .github/workflows/build-release.yml delete mode 100644 csrc/api/pybind.cu create mode 100644 cula/cudac.py diff --git a/.github/workflows/build-release.yml b/.github/workflows/build-release.yml new file mode 100644 index 00000000..cd5b54d0 --- /dev/null +++ b/.github/workflows/build-release.yml @@ -0,0 +1,125 @@ +name: Build & Release Wheels + +on: + push: + tags: + - "v*" + workflow_dispatch: + +concurrency: + group: build-release-${{ github.ref }} + cancel-in-progress: true + +jobs: + build-wheel: + name: "wheel / ${{ 
matrix.cuda }} / cp312 / ${{ matrix.arch }}" + runs-on: ${{ matrix.runner }} + strategy: + fail-fast: false + matrix: + cuda: + - cu129 + - cu130 + arch: + - x86_64 + - aarch64 + include: + - cuda: cu129 + cuda_version: "12.9.0" + torch_index: "https://download.pytorch.org/whl/cu129" + - cuda: cu130 + cuda_version: "13.0.0" + torch_index: "https://download.pytorch.org/whl/cu130" + - arch: x86_64 + runner: ubuntu-latest + - arch: aarch64 + runner: ubuntu-24.04-arm + container: + image: "nvidia/cuda:${{ matrix.cuda_version }}-devel-ubuntu24.04" + + steps: + - name: Free disk space + run: | + rm -rf /opt/hostedtoolcache /usr/local/lib/android /usr/share/dotnet \ + /usr/local/share/boost /opt/ghc 2>/dev/null || true + apt-get clean 2>/dev/null || true + df -h / || true + + - name: Install git + run: | + apt-get update && apt-get install -y --no-install-recommends git \ + && rm -rf /var/lib/apt/lists/* + + - name: Checkout + uses: actions/checkout@v5 + with: + fetch-depth: 0 + submodules: recursive + + - name: Configure git safe directory + run: git config --global --add safe.directory "$GITHUB_WORKSPACE" + + - name: Set up Python + uses: actions/setup-python@v6 + with: + python-version: "3.12" + + - name: Install Python dependencies + run: | + python -m pip install --no-cache-dir --upgrade pip + python -m pip install --no-cache-dir torch --index-url ${{ matrix.torch_index }} + python -m pip install --no-cache-dir setuptools wheel "setuptools_scm>=6.0" build ninja + + - name: Compute version + id: version + run: | + if [[ "$GITHUB_REF" == refs/tags/v* ]]; then + BASE="${GITHUB_REF#refs/tags/v}" + else + # Strip any local segment (+gXXX) so we get a clean base + BASE=$(python -c "from setuptools_scm import get_version; print(get_version().split('+')[0])") + fi + echo "version=${BASE}+${{ matrix.cuda }}" >> "$GITHUB_OUTPUT" + + - name: Build fat-binary wheel + env: + CULA_BUILD_ALL_ARCHS: "1" + SETUPTOOLS_SCM_PRETEND_VERSION: "${{ steps.version.outputs.version }}" + 
NVCC_THREADS: "4" + MAX_JOBS: "4" + run: python -m build --wheel --no-isolation + + - name: Verify wheel + run: | + echo "Built wheel:" + ls -lh dist/*.whl + ls dist/*.whl | grep -q "+${{ matrix.cuda }}" \ + || { echo "ERROR: wheel name missing +${{ matrix.cuda }} suffix"; exit 1; } + + - name: Upload wheel artifact + uses: actions/upload-artifact@v6 + with: + name: wheel-${{ matrix.cuda }}-${{ matrix.arch }} + path: dist/*.whl + + release: + name: Create GitHub Release + needs: [build-wheel] + runs-on: ubuntu-latest + if: startsWith(github.ref, 'refs/tags/v') + permissions: + contents: write + steps: + - name: Download all artifacts + uses: actions/download-artifact@v6 + with: + path: artifacts/ + + - name: Create release + uses: softprops/action-gh-release@v3 + with: + files: | + artifacts/wheel-*/*.whl + generate_release_notes: true + draft: true + prerelease: ${{ contains(github.ref, 'rc') || contains(github.ref, 'beta') || contains(github.ref, 'alpha') }} diff --git a/README.md b/README.md index 7bed61e2..16e8e7b6 100644 --- a/README.md +++ b/README.md @@ -24,6 +24,18 @@ cuLA supports both **Hopper (SM90)** and **Blackwell (SM10X)** GPUs. > **Note:** The PyTorch CUDA version must match your system CUDA Toolkit version. Check with `nvcc --version` and `python -c "import torch; print(torch.version.cuda)"`. +### Pre-built Wheels + +Pre-built fat-binary wheels (SM90 + SM100 + SM103) are available on [GitHub Releases](https://github.com/inclusionAI/cuLA/releases): + +```bash +pip install cuda-linear-attention -f https://github.com/inclusionAI/cuLA/releases/expanded_assets/ +``` + +Replace `` with the release version (e.g., `v0.2.0`). `pip` will select the wheel matching your local CUDA version. Or download the `.whl` file directly from the [Releases page](https://github.com/inclusionAI/cuLA/releases) and install it with `pip install .whl`. 
+ +### Build from Source + **Clone cuLA & dependencies:** ```bash diff --git a/csrc/api/kda_sm100.cu b/csrc/api/kda_sm100.cu index 7edca370..020d90ca 100644 --- a/csrc/api/kda_sm100.cu +++ b/csrc/api/kda_sm100.cu @@ -188,4 +188,10 @@ ChunkKDAFwdRecompWU( StaticPersistentTileScheduler::Params{tile_num, params.h_v, params.heads_per_group, params.num_sm, nullptr}; kda::sm100::run_kda_fwd_recomp_w_u_sm100(params, at::cuda::getCurrentCUDAStream()); +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.doc() = "cuLA SM100/SM103 kernels"; + m.def("chunk_kda_fwd_intra_cuda", &ChunkKDAFwdIntra); + m.def("recompute_w_u_cuda", &ChunkKDAFwdRecompWU); } \ No newline at end of file diff --git a/csrc/api/kda_sm90.cu b/csrc/api/kda_sm90.cu index 9e016eb1..d80df7cc 100644 --- a/csrc/api/kda_sm90.cu +++ b/csrc/api/kda_sm90.cu @@ -191,3 +191,8 @@ kda_fwd_prefill( return {output, output_state}; } + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.doc() = "cuLA SM90 kernels"; + m.def("kda_fwd_prefill", &kda_fwd_prefill); +} diff --git a/csrc/api/pybind.cu b/csrc/api/pybind.cu deleted file mode 100644 index d14a41c5..00000000 --- a/csrc/api/pybind.cu +++ /dev/null @@ -1,80 +0,0 @@ -// Copyright 2025-2026 Ant Group Co., Ltd. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include -#include -#include -#include - -#if defined(CULA_SM100_ENABLED) || defined(CULA_SM103_ENABLED) -void -ChunkKDAFwdIntra( - at::Tensor q, - at::Tensor k, - at::Tensor g, - at::Tensor beta, - at::Tensor cu_seqlens, - at::Tensor chunk_indices, - at::Tensor Aqk_out, - at::Tensor Akk_out, - at::Tensor tile_counter, - float scale, - int chunk_size, - bool use_tf32_inverse, - bool unified_gref); -void -ChunkKDAFwdRecompWU( - at::Tensor k, - at::Tensor v, - at::Tensor beta, - at::Tensor A, - at::Tensor g, - at::Tensor cu_seqlens, - at::Tensor chunk_indices, - at::Tensor w_out, - at::Tensor u_out, - at::Tensor kg_out, - int chunk_size, - std::optional q, - std::optional qg_out); -#endif - -#if defined(CULA_SM90A_ENABLED) -std::tuple> -kda_fwd_prefill( - std::optional output_, - std::optional output_state_, - torch::Tensor const& q, - torch::Tensor const& k, - torch::Tensor const& v, - std::optional input_state_, - std::optional alpha_, - std::optional beta_, - torch::Tensor const& cu_seqlens, - torch::Tensor workspace_buffer, - float scale, - bool output_final_state, - bool safe_gate); -#endif - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - m.doc() = "cuLA"; -#if defined(CULA_SM100_ENABLED) || defined(CULA_SM103_ENABLED) - m.def("chunk_kda_fwd_intra_cuda", &ChunkKDAFwdIntra); - m.def("recompute_w_u_cuda", &ChunkKDAFwdRecompWU); -#endif -#if defined(CULA_SM90A_ENABLED) - m.def("kda_fwd_prefill", &kda_fwd_prefill); -#endif -} diff --git a/cula/__init__.py b/cula/__init__.py index 7272e289..6e13aa13 100644 --- a/cula/__init__.py +++ b/cula/__init__.py @@ -12,7 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "0.1.0" +try: + from cula._version import version as __version__ +except ImportError: + __version__ = "0.1.0" from cula.ops.lightning_attn_sm100 import LinearAttentionChunkwiseDecay diff --git a/cula/cudac.py b/cula/cudac.py new file mode 100644 index 00000000..287d9747 --- /dev/null +++ b/cula/cudac.py @@ -0,0 +1,65 @@ +# Copyright 2025-2026 Ant Group Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Unified interface to per-architecture CUDA extensions. + +Downstream code can continue to use ``import cula.cudac as cula_cuda`` +and call ``cula_cuda.kda_fwd_prefill(...)`` or +``cula_cuda.chunk_kda_fwd_intra_cuda(...)`` without knowing which +extension provides the function. 
+""" + +import importlib +import sys +from types import ModuleType + + +class _CudacProxy(ModuleType): + """Lazy proxy that exposes functions from all built arch extensions.""" + + def __init__(self): + super().__init__(__name__) + self.__path__ = [] + self._modules_loaded = False + self._funcs: dict[str, object] = {} + + def _load(self): + if self._modules_loaded: + return + self._modules_loaded = True + for ext_name in ("cula._cudac_sm100", "cula._cudac_sm90"): + try: + mod = importlib.import_module(ext_name) + for attr in dir(mod): + if not attr.startswith("_"): + self._funcs[attr] = getattr(mod, attr) + except ImportError: + pass + self.__dict__.update(self._funcs) + + def __getattr__(self, name: str): + if name.startswith("_"): + raise AttributeError(name) + self._load() + try: + return self._funcs[name] + except KeyError: + raise AttributeError(f"module 'cula.cudac' has no attribute '{name}'") from None + + def __dir__(self): + self._load() + return list(self._funcs.keys()) + + +sys.modules[__name__] = _CudacProxy() diff --git a/pyproject.toml b/pyproject.toml index ef1a531b..a6c34562 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,8 +11,8 @@ readme = "README.md" authors = [ { name = "cula contributors" } ] requires-python = ">=3.10" dependencies = [ - "nvidia-cutlass-dsl>=4.4.2", - "apache-tvm-ffi>=0.1.9", + "nvidia-cutlass-dsl==4.4.2", + "apache-tvm-ffi==0.1.9", ] license = { text = "Apache-2.0" } @@ -84,9 +84,6 @@ force-sort-within-sections = false "cula/kda/blackwell_fused_fwd.py" = ["F821"] [tool.setuptools_scm] -# write generated version into package for runtime access write_to = "cula/_version.py" -# add a date-based local suffix when needed -local_scheme = "node-and-date" -# fallback for non-git sources +local_scheme = "no-local-version" fallback_version = "0.1.0" diff --git a/scripts/build_wheel.sh b/scripts/build_wheel.sh index 42b35665..79ac3305 100755 --- a/scripts/build_wheel.sh +++ b/scripts/build_wheel.sh @@ -18,10 +18,19 @@ cd 
"$REPO_ROOT" # Parse args ISOLATION_FLAG="--no-isolation" -if [[ "${1:-}" == "--isolated" ]]; then - ISOLATION_FLAG="" - echo "[build_wheel] Using isolated build environment" -else +for arg in "$@"; do + case "$arg" in + --isolated) + ISOLATION_FLAG="" + echo "[build_wheel] Using isolated build environment" + ;; + --fat) + export CULA_BUILD_ALL_ARCHS=1 + echo "[build_wheel] Fat binary: building for all SM architectures" + ;; + esac +done +if [[ "$ISOLATION_FLAG" == "--no-isolation" ]]; then echo "[build_wheel] Using current environment (--no-isolation)" fi @@ -33,6 +42,7 @@ rm -rf dist build *.egg-info echo "[build_wheel] Python: $(python -V 2>&1)" echo "[build_wheel] torch: $(python -c 'import torch; print(torch.__version__)' 2>/dev/null || echo 'not installed')" echo "[build_wheel] CUDA: $(nvcc --version 2>/dev/null | grep 'release' | sed 's/.*release //' | sed 's/,.*//' || echo 'not found')" +echo "[build_wheel] Fat binary: ${CULA_BUILD_ALL_ARCHS:-0}" # Build wheel echo "[build_wheel] Building wheel..." diff --git a/setup.py b/setup.py index f7b11b95..0e127e81 100644 --- a/setup.py +++ b/setup.py @@ -46,13 +46,15 @@ def detect_gpu_archs() -> tuple[bool, bool, bool]: def resolve_disable_flag(env_name: str, detected: bool) -> bool: """ Resolve whether to disable a given SM target. + - If CULA_BUILD_ALL_ARCHS is set, all targets are enabled unconditionally. - If the environment variable is explicitly set, honour it. - Otherwise, disable the target when no matching GPU is detected. """ + if os.getenv("CULA_BUILD_ALL_ARCHS", "0") == "1": + return False env_val = os.getenv(env_name) if env_val is not None: return env_val.lower() in ["true", "1", "y", "yes"] - # Auto-detect: disable if no matching device found disable = not detected if disable: print(f" No matching GPU detected; auto-setting {env_name}=1 (disable). 
Set {env_name}=0 to override.") @@ -66,7 +68,11 @@ def get_features_args(): USE_FAST_MATH = os.getenv("CULA_USE_FAST_MATH", "1") == "1" -print("Detecting GPU architectures...") +if os.getenv("CULA_BUILD_ALL_ARCHS", "0") == "1": + print("CULA_BUILD_ALL_ARCHS=1: enabling all SM targets (sm90a, sm100a, sm103a)") +else: + print("Detecting GPU architectures...") + _has_sm100, _has_sm103, _has_sm90 = detect_gpu_archs() DISABLE_SM100 = resolve_disable_flag("CULA_DISABLE_SM100", _has_sm100) DISABLE_SM103 = resolve_disable_flag("CULA_DISABLE_SM103", _has_sm103) @@ -111,26 +117,6 @@ def assert_blackwell_build_env() -> None: ) -def get_arch_flags(): - major, minor = get_nvcc_version() - print(f"Compiling using NVCC {major}.{minor}") - - # Validate Blackwell build environment - assert_blackwell_build_env() - - arch_flags = [] - if not DISABLE_SM100: - arch_flags.extend(["-gencode", "arch=compute_100a,code=sm_100a"]) - arch_flags.extend(["-DCULA_SM100_ENABLED"]) - if not DISABLE_SM103: - arch_flags.extend(["-gencode", "arch=compute_103a,code=sm_103a"]) - arch_flags.extend(["-DCULA_SM103_ENABLED"]) - if not DISABLE_SM90: - arch_flags.extend(["-gencode", "arch=compute_90a,code=sm_90a"]) - arch_flags.extend(["-DCULA_SM90A_ENABLED"]) - return arch_flags - - def get_nvcc_thread_args(): nvcc_threads = os.getenv("NVCC_THREADS") or "32" return ["--threads", nvcc_threads] @@ -145,61 +131,78 @@ def get_nvcc_thread_args(): else: cxx_args = ["-O3", "-std=c++20", "-DNDEBUG", "-Wno-deprecated-declarations"] -cuda_sources = [ - "csrc/api/pybind.cu", +nvcc_common_args = [ + "-O3", + "-std=c++20", + "-DNDEBUG", + # "-D_USE_MATH_DEFINES", + "-Wno-deprecated-declarations", + "-U__CUDA_NO_HALF_OPERATORS__", + "-U__CUDA_NO_HALF_CONVERSIONS__", + "-U__CUDA_NO_HALF2_OPERATORS__", + "-U__CUDA_NO_BFLOAT16_CONVERSIONS__", + "--expt-relaxed-constexpr", + "--expt-extended-lambda", + "-lineinfo", + "--ptxas-options=--verbose,--register-usage-level=10,--warn-on-local-memory-usage", + "-diag-suppress=3189", 
] + +include_dirs = [ + Path(this_dir) / "csrc", + Path(this_dir) / "csrc" / "kerutils" / "include", + Path(this_dir) / "csrc" / "cutlass" / "include", + Path(this_dir) / "csrc" / "cutlass" / "tools" / "util" / "include", +] + +major, minor = get_nvcc_version() +print(f"Compiling using NVCC {major}.{minor}") +assert_blackwell_build_env() + +ext_modules = [] + if not DISABLE_SM100 or not DISABLE_SM103: - cuda_sources.extend( - [ - "csrc/api/kda_sm100.cu", - "csrc/kda/sm100/kda_fwd_sm100.cu", - ] - ) -if not DISABLE_SM90: - cuda_sources.extend( - [ - "csrc/api/kda_sm90.cu", - "csrc/kda/sm90/kda_fwd_sm90.cu", - "csrc/kda/sm90/kda_fwd_sm90_safe_gate.cu", - ] + sm100_arch_flags = [] + if not DISABLE_SM100: + sm100_arch_flags.extend(["-gencode", "arch=compute_100a,code=sm_100a", "-DCULA_SM100_ENABLED"]) + if not DISABLE_SM103: + sm100_arch_flags.extend(["-gencode", "arch=compute_103a,code=sm_103a", "-DCULA_SM103_ENABLED"]) + + ext_modules.append( + CUDAExtension( + name="cula._cudac_sm100", + sources=[ + "csrc/api/kda_sm100.cu", + "csrc/kda/sm100/kda_fwd_sm100.cu", + ], + extra_compile_args={ + "cxx": cxx_args + get_features_args(), + "nvcc": nvcc_common_args + get_features_args() + sm100_arch_flags + + get_nvcc_thread_args() + (["--use_fast_math"] if USE_FAST_MATH else []), + }, + include_dirs=include_dirs, + ) ) -ext_modules = [] -ext_modules.append( - CUDAExtension( - name="cula.cudac", - sources=cuda_sources, - extra_compile_args={ - "cxx": cxx_args + get_features_args(), - "nvcc": [ - "-O3", - "-std=c++20", - "-DNDEBUG", - # "-D_USE_MATH_DEFINES", - "-Wno-deprecated-declarations", - "-U__CUDA_NO_HALF_OPERATORS__", - "-U__CUDA_NO_HALF_CONVERSIONS__", - "-U__CUDA_NO_HALF2_OPERATORS__", - "-U__CUDA_NO_BFLOAT16_CONVERSIONS__", - "--expt-relaxed-constexpr", - "--expt-extended-lambda", - "-lineinfo", - "--ptxas-options=--verbose,--register-usage-level=10,--warn-on-local-memory-usage", - "-diag-suppress=3189", # suppress the warning of torch in C++ 20 - ] - + 
get_features_args() - + get_arch_flags() - + get_nvcc_thread_args() - + (["--use_fast_math"] if USE_FAST_MATH else []), - }, - include_dirs=[ - Path(this_dir) / "csrc", - Path(this_dir) / "csrc" / "kerutils" / "include", - Path(this_dir) / "csrc" / "cutlass" / "include", - Path(this_dir) / "csrc" / "cutlass" / "tools" / "util" / "include", - ], +if not DISABLE_SM90: + sm90_arch_flags = ["-gencode", "arch=compute_90a,code=sm_90a", "-DCULA_SM90A_ENABLED"] + + ext_modules.append( + CUDAExtension( + name="cula._cudac_sm90", + sources=[ + "csrc/api/kda_sm90.cu", + "csrc/kda/sm90/kda_fwd_sm90.cu", + "csrc/kda/sm90/kda_fwd_sm90_safe_gate.cu", + ], + extra_compile_args={ + "cxx": cxx_args + get_features_args(), + "nvcc": nvcc_common_args + get_features_args() + sm90_arch_flags + + get_nvcc_thread_args() + (["--use_fast_math"] if USE_FAST_MATH else []), + }, + include_dirs=include_dirs, + ) ) -) setup( name="cuda-linear-attention", From 761673708d326bffe3d102aff3e09610e897e088 Mon Sep 17 00:00:00 2001 From: tongke Date: Fri, 12 Jun 2026 11:38:28 +0800 Subject: [PATCH 02/14] fix ruff lint errors --- setup.py | 14 ++++++++++---- tests/conftest.py | 7 ++----- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/setup.py b/setup.py index 0e127e81..3df211b5 100644 --- a/setup.py +++ b/setup.py @@ -177,8 +177,11 @@ def get_nvcc_thread_args(): ], extra_compile_args={ "cxx": cxx_args + get_features_args(), - "nvcc": nvcc_common_args + get_features_args() + sm100_arch_flags - + get_nvcc_thread_args() + (["--use_fast_math"] if USE_FAST_MATH else []), + "nvcc": nvcc_common_args + + get_features_args() + + sm100_arch_flags + + get_nvcc_thread_args() + + (["--use_fast_math"] if USE_FAST_MATH else []), }, include_dirs=include_dirs, ) @@ -197,8 +200,11 @@ def get_nvcc_thread_args(): ], extra_compile_args={ "cxx": cxx_args + get_features_args(), - "nvcc": nvcc_common_args + get_features_args() + sm90_arch_flags - + get_nvcc_thread_args() + (["--use_fast_math"] if USE_FAST_MATH 
else []), + "nvcc": nvcc_common_args + + get_features_args() + + sm90_arch_flags + + get_nvcc_thread_args() + + (["--use_fast_math"] if USE_FAST_MATH else []), }, include_dirs=include_dirs, ) diff --git a/tests/conftest.py b/tests/conftest.py index f144c10b..a9338aca 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,4 +1,5 @@ import re + import pytest import torch @@ -56,9 +57,5 @@ def pytest_collection_modifyitems(config, items): item.add_marker(skip_slow) continue callspec = getattr(item, "callspec", None) - if ( - callspec is not None - and callspec.params.get("disable_recompute") - and "kda_fast_norecomp" not in item.keywords - ): + if callspec is not None and callspec.params.get("disable_recompute") and "kda_fast_norecomp" not in item.keywords: item.add_marker(skip_fast_norecomp) From b148c526d513062da686aa5c00c30c67590bca16 Mon Sep 17 00:00:00 2001 From: tongke Date: Fri, 12 Jun 2026 11:52:37 +0800 Subject: [PATCH 03/14] revert version requirements changes --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index a6c34562..fe93e562 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,8 +11,8 @@ readme = "README.md" authors = [ { name = "cula contributors" } ] requires-python = ">=3.10" dependencies = [ - "nvidia-cutlass-dsl==4.4.2", - "apache-tvm-ffi==0.1.9", + "nvidia-cutlass-dsl>=4.4.2", + "apache-tvm-ffi>=0.1.9", ] license = { text = "Apache-2.0" } From 2c56304dc128cb8935238a7566e91528cd9c296e Mon Sep 17 00:00:00 2001 From: tongke Date: Fri, 12 Jun 2026 12:02:35 +0800 Subject: [PATCH 04/14] Make cudac proxy thread-safe and raise on missing extensions Add double-checked locking to _CudacProxy._load() to prevent race conditions in multi-threaded environments. Raise a descriptive ImportError when no CUDA extensions can be loaded instead of silently producing AttributeError later. 
--- cula/cudac.py | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/cula/cudac.py b/cula/cudac.py index 287d9747..c256f6ef 100644 --- a/cula/cudac.py +++ b/cula/cudac.py @@ -22,6 +22,7 @@ import importlib import sys +import threading from types import ModuleType @@ -33,20 +34,31 @@ def __init__(self): self.__path__ = [] self._modules_loaded = False self._funcs: dict[str, object] = {} + self._lock = threading.Lock() def _load(self): if self._modules_loaded: return - self._modules_loaded = True - for ext_name in ("cula._cudac_sm100", "cula._cudac_sm90"): - try: - mod = importlib.import_module(ext_name) - for attr in dir(mod): - if not attr.startswith("_"): - self._funcs[attr] = getattr(mod, attr) - except ImportError: - pass - self.__dict__.update(self._funcs) + with self._lock: + if self._modules_loaded: + return + loaded_any = False + for ext_name in ("cula._cudac_sm100", "cula._cudac_sm90"): + try: + mod = importlib.import_module(ext_name) + for attr in dir(mod): + if not attr.startswith("_"): + self._funcs[attr] = getattr(mod, attr) + loaded_any = True + except ImportError: + pass + if not loaded_any: + raise ImportError( + "None of the cuLA CUDA extensions ('cula._cudac_sm100', 'cula._cudac_sm90') " + "could be imported. Please make sure cuLA is compiled correctly." + ) + self.__dict__.update(self._funcs) + self._modules_loaded = True def __getattr__(self, name: str): if name.startswith("_"): From c955d4762867c7986d58a138edac187fd803e0aa Mon Sep 17 00:00:00 2001 From: yz262713 Date: Wed, 17 Jun 2026 19:36:51 +0800 Subject: [PATCH 05/14] Surface per-extension import errors in cudac proxy The blanket `except ImportError: pass` swallowed the actual failure reason, making it impossible to diagnose missing shared libraries or build issues. Collect each extension's ImportError and include them in the raised message. 
--- cula/cudac.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/cula/cudac.py b/cula/cudac.py index c256f6ef..8dfcc49b 100644 --- a/cula/cudac.py +++ b/cula/cudac.py @@ -43,6 +43,7 @@ def _load(self): if self._modules_loaded: return loaded_any = False + errors: dict[str, Exception] = {} for ext_name in ("cula._cudac_sm100", "cula._cudac_sm90"): try: mod = importlib.import_module(ext_name) @@ -50,12 +51,14 @@ def _load(self): if not attr.startswith("_"): self._funcs[attr] = getattr(mod, attr) loaded_any = True - except ImportError: - pass + except ImportError as exc: + errors[ext_name] = exc if not loaded_any: + details = "; ".join(f"{name}: {exc}" for name, exc in errors.items()) raise ImportError( - "None of the cuLA CUDA extensions ('cula._cudac_sm100', 'cula._cudac_sm90') " - "could be imported. Please make sure cuLA is compiled correctly." + "None of the cuLA CUDA extensions could be imported. " + f"Per-extension errors: [{details}]. " + "Please make sure cuLA is compiled correctly." 
) self.__dict__.update(self._funcs) self._modules_loaded = True From 6ccb8fac78c7ade3b16ed9f30f1734f55b0b0a8f Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 17 Jun 2026 14:47:35 +0000 Subject: [PATCH 06/14] Fix build-release matrix with DRY expression mapping --- .github/workflows/build-release.yml | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/.github/workflows/build-release.yml b/.github/workflows/build-release.yml index cd5b54d0..ae15a184 100644 --- a/.github/workflows/build-release.yml +++ b/.github/workflows/build-release.yml @@ -13,7 +13,7 @@ concurrency: jobs: build-wheel: name: "wheel / ${{ matrix.cuda }} / cp312 / ${{ matrix.arch }}" - runs-on: ${{ matrix.runner }} + runs-on: ${{ matrix.arch == 'aarch64' && 'ubuntu-24.04-arm' || 'ubuntu-latest' }} strategy: fail-fast: false matrix: @@ -23,19 +23,8 @@ jobs: arch: - x86_64 - aarch64 - include: - - cuda: cu129 - cuda_version: "12.9.0" - torch_index: "https://download.pytorch.org/whl/cu129" - - cuda: cu130 - cuda_version: "13.0.0" - torch_index: "https://download.pytorch.org/whl/cu130" - - arch: x86_64 - runner: ubuntu-latest - - arch: aarch64 - runner: ubuntu-24.04-arm container: - image: "nvidia/cuda:${{ matrix.cuda_version }}-devel-ubuntu24.04" + image: "nvidia/cuda:${{ matrix.cuda == 'cu129' && '12.9.0' || '13.0.0' }}-devel-ubuntu24.04" steps: - name: Free disk space @@ -67,7 +56,7 @@ jobs: - name: Install Python dependencies run: | python -m pip install --no-cache-dir --upgrade pip - python -m pip install --no-cache-dir torch --index-url ${{ matrix.torch_index }} + python -m pip install --no-cache-dir torch --index-url ${{ matrix.cuda == 'cu129' && 'https://download.pytorch.org/whl/cu129' || 'https://download.pytorch.org/whl/cu130' }} python -m pip install --no-cache-dir setuptools wheel "setuptools_scm>=6.0" build ninja - name: Compute version From f700810b816c79af0c56351711940d3fc7d8c292 Mon Sep 17 
00:00:00 2001 From: tongke <124763920+tongke6@users.noreply.github.com> Date: Wed, 17 Jun 2026 22:51:28 +0800 Subject: [PATCH 07/14] Apply suggestions from code review Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- cula/cudac.py | 4 +++- setup.py | 4 ++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/cula/cudac.py b/cula/cudac.py index 8dfcc49b..1bbaf108 100644 --- a/cula/cudac.py +++ b/cula/cudac.py @@ -77,4 +77,6 @@ def __dir__(self): return list(self._funcs.keys()) -sys.modules[__name__] = _CudacProxy() +_proxy = _CudacProxy() +_proxy.__dict__.update({k: globals().get(k) for k in ("__spec__", "__file__", "__package__", "__loader__")}) +sys.modules[__name__] = _proxy diff --git a/setup.py b/setup.py index 3df211b5..78c61e5c 100644 --- a/setup.py +++ b/setup.py @@ -164,9 +164,9 @@ def get_nvcc_thread_args(): if not DISABLE_SM100 or not DISABLE_SM103: sm100_arch_flags = [] if not DISABLE_SM100: - sm100_arch_flags.extend(["-gencode", "arch=compute_100a,code=sm_100a", "-DCULA_SM100_ENABLED"]) + sm100_arch_flags.extend(["-gencode", "arch=compute_100a,code=sm_100a"]) if not DISABLE_SM103: - sm100_arch_flags.extend(["-gencode", "arch=compute_103a,code=sm_103a", "-DCULA_SM103_ENABLED"]) + sm100_arch_flags.extend(["-gencode", "arch=compute_103a,code=sm_103a"]) ext_modules.append( CUDAExtension( From e0a1e214179fd5e53dcfe5bb2e114909e104e91b Mon Sep 17 00:00:00 2001 From: tongke <124763920+tongke6@users.noreply.github.com> Date: Wed, 17 Jun 2026 23:06:57 +0800 Subject: [PATCH 08/14] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- README.md | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 16e8e7b6..eabf435b 100644 --- a/README.md +++ b/README.md @@ -28,11 +28,9 @@ cuLA supports both **Hopper (SM90)** and **Blackwell (SM10X)** GPUs. 
Pre-built fat-binary wheels (SM90 + SM100 + SM103) are available on [GitHub Releases](https://github.com/inclusionAI/cuLA/releases): -```bash -pip install cuda-linear-attention -f https://github.com/inclusionAI/cuLA/releases/expanded_assets/ -``` + pip install "cuda-linear-attention==+" -f https://github.com/inclusionAI/cuLA/releases/expanded_assets/ -Replace `` with the release version (e.g., `v0.2.0`). `pip` will select the wheel matching your local CUDA version. Or download the `.whl` file directly from the [Releases page](https://github.com/inclusionAI/cuLA/releases) and install it with `pip install .whl`. +Replace `` with the release tag (e.g., `v0.2.0`), `` with the base version (e.g., `0.2.0`), and `` with your PyTorch CUDA build tag (e.g., `cu129` or `cu130`). Or download the `.whl` file directly from the [Releases page](https://github.com/inclusionAI/cuLA/releases) and install it with `pip install .whl`. ### Build from Source From ead4f7eb5204194854f8f3a7dbddca03a24dedbb Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 17 Jun 2026 15:14:59 +0000 Subject: [PATCH 09/14] Add README example for building fat wheels --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index eabf435b..7502e302 100644 --- a/README.md +++ b/README.md @@ -57,6 +57,12 @@ pip install -e third_party/flash-linear-attention pip install -e . 
--no-build-isolation ``` +**Build fat wheel (SM90 + SM100 + SM103):** + +```bash +CULA_BUILD_ALL_ARCHS=1 python -m build --wheel --no-isolation +``` + ## Quick Start ### KDA (Kimi Delta Attention) — Blackwell (SM10X) From 75f171066e94be3e42f4c59c0745e762a691f704 Mon Sep 17 00:00:00 2001 From: tongke Date: Mon, 22 Jun 2026 19:13:26 +0800 Subject: [PATCH 10/14] Surface partial-extension failures in cudac proxy Catch (ImportError, AttributeError, OSError) when scanning per-arch extensions: pybind11 modules commonly surface missing-symbol / ABI / libcudart failures as AttributeError or OSError rather than ImportError, so the prior narrow catch silently dropped one extension's failure when another succeeded, leaving its kernels missing without diagnostic. Emit a UserWarning naming each failing extension on partial failure (all-fail still raises ImportError), preserving the c955d47 intent of surfacing per-extension errors. Also document the load-once-per-process semantics in the module docstring. --- cula/cudac.py | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/cula/cudac.py b/cula/cudac.py index 1bbaf108..23f96c97 100644 --- a/cula/cudac.py +++ b/cula/cudac.py @@ -18,11 +18,19 @@ and call ``cula_cuda.kda_fwd_prefill(...)`` or ``cula_cuda.chunk_kda_fwd_intra_cuda(...)`` without knowing which extension provides the function. + +Loading is **once per process**: the first attribute access triggers a +single threaded scan of every built ``cula._cudac_sm*`` extension; the +discovered callables are then cached on the module instance and no +further re-scan happens. Installing or rebuilding an extension after a +process has already imported ``cula.cudac`` will therefore not be picked +up -- callers that need a freshly built extension must restart Python. 
""" import importlib import sys import threading +import warnings from types import ModuleType @@ -44,6 +52,10 @@ def _load(self): return loaded_any = False errors: dict[str, Exception] = {} + # pybind11 extensions surface missing-symbol / ABI / libcudart + # failures as AttributeError or OSError at import time rather + # than ImportError, so catch the broader set to keep matching + # the c955d47 intent of surfacing every per-extension failure. for ext_name in ("cula._cudac_sm100", "cula._cudac_sm90"): try: mod = importlib.import_module(ext_name) @@ -51,7 +63,7 @@ def _load(self): if not attr.startswith("_"): self._funcs[attr] = getattr(mod, attr) loaded_any = True - except ImportError as exc: + except (ImportError, AttributeError, OSError) as exc: errors[ext_name] = exc if not loaded_any: details = "; ".join(f"{name}: {exc}" for name, exc in errors.items()) @@ -60,6 +72,16 @@ def _load(self): f"Per-extension errors: [{details}]. " "Please make sure cuLA is compiled correctly." ) + # Partial failures are not fatal (each surviving extension is + # usable), but the user still needs to know which kernel sets + # are missing so they can diagnose a partial / mismatched build. + if errors: + details = "; ".join(f"{name}: {exc}" for name, exc in errors.items()) + warnings.warn( + "Some cuLA CUDA extensions could not be imported and their " + f"kernels are unavailable. 
Per-extension errors: [{details}].", + stacklevel=2, + ) self.__dict__.update(self._funcs) self._modules_loaded = True From ceab48b5f83866bb869bda0864bd7a1580557497 Mon Sep 17 00:00:00 2001 From: tongke Date: Mon, 22 Jun 2026 20:00:03 +0800 Subject: [PATCH 11/14] Build release wheels against manylinux_2_28 --- .github/workflows/build-release.yml | 29 +++++++++++++++++++++++------ README.md | 4 ++-- 2 files changed, 25 insertions(+), 8 deletions(-) diff --git a/.github/workflows/build-release.yml b/.github/workflows/build-release.yml index ae15a184..e2cdd5cf 100644 --- a/.github/workflows/build-release.yml +++ b/.github/workflows/build-release.yml @@ -24,20 +24,21 @@ jobs: - x86_64 - aarch64 container: - image: "nvidia/cuda:${{ matrix.cuda == 'cu129' && '12.9.0' || '13.0.0' }}-devel-ubuntu24.04" + # UBI 8 provides the glibc 2.28 baseline required by manylinux_2_28. + image: "nvidia/cuda:${{ matrix.cuda == 'cu129' && '12.9.0' || '13.0.0' }}-devel-ubi8" steps: - name: Free disk space run: | rm -rf /opt/hostedtoolcache /usr/local/lib/android /usr/share/dotnet \ /usr/local/share/boost /opt/ghc 2>/dev/null || true - apt-get clean 2>/dev/null || true + dnf clean all 2>/dev/null || true df -h / || true - name: Install git run: | - apt-get update && apt-get install -y --no-install-recommends git \ - && rm -rf /var/lib/apt/lists/* + dnf install -y git + dnf clean all - name: Checkout uses: actions/checkout@v5 @@ -57,7 +58,7 @@ jobs: run: | python -m pip install --no-cache-dir --upgrade pip python -m pip install --no-cache-dir torch --index-url ${{ matrix.cuda == 'cu129' && 'https://download.pytorch.org/whl/cu129' || 'https://download.pytorch.org/whl/cu130' }} - python -m pip install --no-cache-dir setuptools wheel "setuptools_scm>=6.0" build ninja + python -m pip install --no-cache-dir setuptools wheel "setuptools_scm>=6.0" build ninja auditwheel patchelf - name: Compute version id: version @@ -76,7 +77,20 @@ jobs: SETUPTOOLS_SCM_PRETEND_VERSION: "${{ 
steps.version.outputs.version }}" NVCC_THREADS: "4" MAX_JOBS: "4" - run: python -m build --wheel --no-isolation + run: python -m build --wheel --no-isolation --outdir dist-raw + + - name: Repair wheel for manylinux_2_28 + run: | + # These libraries are supplied by the NVIDIA driver, PyTorch, or + # PyTorch's CUDA runtime dependency and must remain external. + python -m auditwheel repair \ + --plat manylinux_2_28_${{ matrix.arch }} \ + --exclude libcuda.so.1 \ + --exclude 'libcudart.so.*' \ + --exclude 'libc10*.so' \ + --exclude 'libtorch*.so' \ + --wheel-dir dist \ + dist-raw/*.whl - name: Verify wheel run: | @@ -84,6 +98,9 @@ jobs: ls -lh dist/*.whl ls dist/*.whl | grep -q "+${{ matrix.cuda }}" \ || { echo "ERROR: wheel name missing +${{ matrix.cuda }} suffix"; exit 1; } + ls dist/*.whl | grep -q "manylinux_2_28_${{ matrix.arch }}" \ + || { echo "ERROR: wheel is not tagged manylinux_2_28_${{ matrix.arch }}"; exit 1; } + python -m auditwheel show dist/*.whl - name: Upload wheel artifact uses: actions/upload-artifact@v6 diff --git a/README.md b/README.md index 7502e302..09418811 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,7 @@ cuLA supports both **Hopper (SM90)** and **Blackwell (SM10X)** GPUs. ### Pre-built Wheels -Pre-built fat-binary wheels (SM90 + SM100 + SM103) are available on [GitHub Releases](https://github.com/inclusionAI/cuLA/releases): +Pre-built fat-binary wheels (SM90 + SM100 + SM103) are available on [GitHub Releases](https://github.com/inclusionAI/cuLA/releases). Linux wheels target `manylinux_2_28` and require glibc 2.28 or newer: pip install "cuda-linear-attention==<version>+<cuda>" -f https://github.com/inclusionAI/cuLA/releases/expanded_assets/<tag> @@ -255,4 +255,4 @@ No CUDA experience is required as long as you're a quick learner. 
For Q&A and discussion, you can join us through: - **Slack:** [cuLA Slack Community](https://join.slack.com/t/cula-hq/shared_invite/zt-3uaacvm9y-xJwZyGueeKxZRYQlj7~hxw) -- **WeChat:** The WeChat group has exceeded 200 members and can no longer be joined via QR code. To join, please send your WeChat ID to any of the following emails and we'll invite you: **chaofanyu@gmail.com** / **kevinzz08@foxmail.com** / **yzpag@gmail.com** / **haoc80996@gmail.com**. You can also ask someone already in the group to invite you directly. \ No newline at end of file +- **WeChat:** The WeChat group has exceeded 200 members and can no longer be joined via QR code. To join, please send your WeChat ID to any of the following emails and we'll invite you: **chaofanyu@gmail.com** / **kevinzz08@foxmail.com** / **yzpag@gmail.com** / **haoc80996@gmail.com**. You can also ask someone already in the group to invite you directly. From 430531b272cae3b112c5eec5711e5975a0aa4617 Mon Sep 17 00:00:00 2001 From: tongke Date: Mon, 22 Jun 2026 20:35:57 +0800 Subject: [PATCH 12/14] fix python 3.12 GLIBC compat problems on ubi8 --- .github/workflows/build-release.yml | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/.github/workflows/build-release.yml b/.github/workflows/build-release.yml index e2cdd5cf..88686f0f 100644 --- a/.github/workflows/build-release.yml +++ b/.github/workflows/build-release.yml @@ -35,9 +35,9 @@ jobs: dnf clean all 2>/dev/null || true df -h / || true - - name: Install git + - name: Install system dependencies run: | - dnf install -y git + dnf install -y git python3.12 python3.12-devel python3.12-pip dnf clean all - name: Checkout @@ -49,16 +49,11 @@ jobs: - name: Configure git safe directory run: git config --global --add safe.directory "$GITHUB_WORKSPACE" - - name: Set up Python - uses: actions/setup-python@v6 - with: - python-version: "3.12" - - name: Install Python dependencies run: | - python -m pip install --no-cache-dir --upgrade pip - 
python -m pip install --no-cache-dir torch --index-url ${{ matrix.cuda == 'cu129' && 'https://download.pytorch.org/whl/cu129' || 'https://download.pytorch.org/whl/cu130' }} - python -m pip install --no-cache-dir setuptools wheel "setuptools_scm>=6.0" build ninja auditwheel patchelf + python3.12 -m pip install --no-cache-dir --upgrade pip + python3.12 -m pip install --no-cache-dir torch --index-url ${{ matrix.cuda == 'cu129' && 'https://download.pytorch.org/whl/cu129' || 'https://download.pytorch.org/whl/cu130' }} + python3.12 -m pip install --no-cache-dir setuptools wheel "setuptools_scm>=6.0" build ninja auditwheel patchelf - name: Compute version id: version @@ -67,7 +62,7 @@ jobs: BASE="${GITHUB_REF#refs/tags/v}" else # Strip any local segment (+gXXX) so we get a clean base - BASE=$(python -c "from setuptools_scm import get_version; print(get_version().split('+')[0])") + BASE=$(python3.12 -c "from setuptools_scm import get_version; print(get_version().split('+')[0])") fi echo "version=${BASE}+${{ matrix.cuda }}" >> "$GITHUB_OUTPUT" @@ -77,13 +72,13 @@ jobs: SETUPTOOLS_SCM_PRETEND_VERSION: "${{ steps.version.outputs.version }}" NVCC_THREADS: "4" MAX_JOBS: "4" - run: python -m build --wheel --no-isolation --outdir dist-raw + run: python3.12 -m build --wheel --no-isolation --outdir dist-raw - name: Repair wheel for manylinux_2_28 run: | # These libraries are supplied by the NVIDIA driver, PyTorch, or # PyTorch's CUDA runtime dependency and must remain external. 
- python -m auditwheel repair \ + python3.12 -m auditwheel repair \ --plat manylinux_2_28_${{ matrix.arch }} \ --exclude libcuda.so.1 \ --exclude 'libcudart.so.*' \ @@ -100,7 +95,7 @@ jobs: || { echo "ERROR: wheel name missing +${{ matrix.cuda }} suffix"; exit 1; } ls dist/*.whl | grep -q "manylinux_2_28_${{ matrix.arch }}" \ || { echo "ERROR: wheel is not tagged manylinux_2_28_${{ matrix.arch }}"; exit 1; } - python -m auditwheel show dist/*.whl + python3.12 -m auditwheel show dist/*.whl - name: Upload wheel artifact uses: actions/upload-artifact@v6 From a27443ce17d0a5eed710ae82696446a730d04b9c Mon Sep 17 00:00:00 2001 From: tongke Date: Mon, 22 Jun 2026 20:47:40 +0800 Subject: [PATCH 13/14] install gcc13 --- .github/workflows/build-release.yml | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build-release.yml b/.github/workflows/build-release.yml index 88686f0f..72e593c5 100644 --- a/.github/workflows/build-release.yml +++ b/.github/workflows/build-release.yml @@ -14,6 +14,9 @@ jobs: build-wheel: name: "wheel / ${{ matrix.cuda }} / cp312 / ${{ matrix.arch }}" runs-on: ${{ matrix.arch == 'aarch64' && 'ubuntu-24.04-arm' || 'ubuntu-latest' }} + defaults: + run: + shell: bash strategy: fail-fast: false matrix: @@ -37,7 +40,13 @@ jobs: - name: Install system dependencies run: | - dnf install -y git python3.12 python3.12-devel python3.12-pip + dnf install -y \ + git \ + gcc-toolset-13-gcc \ + gcc-toolset-13-gcc-c++ \ + python3.12 \ + python3.12-devel \ + python3.12-pip dnf clean all - name: Checkout @@ -68,11 +77,17 @@ jobs: - name: Build fat-binary wheel env: + CC: /opt/rh/gcc-toolset-13/root/usr/bin/gcc + CXX: /opt/rh/gcc-toolset-13/root/usr/bin/g++ + CUDAHOSTCXX: /opt/rh/gcc-toolset-13/root/usr/bin/g++ CULA_BUILD_ALL_ARCHS: "1" SETUPTOOLS_SCM_PRETEND_VERSION: "${{ steps.version.outputs.version }}" NVCC_THREADS: "4" MAX_JOBS: "4" - run: python3.12 -m build --wheel --no-isolation --outdir dist-raw + run: | + 
"$CC" --version + "$CXX" --version + python3.12 -m build --wheel --no-isolation --outdir dist-raw - name: Repair wheel for manylinux_2_28 run: | From 44ab4013d9e7d14d12e1a29852e78e0b1ab9c08a Mon Sep 17 00:00:00 2001 From: tongke Date: Mon, 22 Jun 2026 22:52:36 +0800 Subject: [PATCH 14/14] Load CUDA extension matching current GPU architecture Select the per-architecture CUDA extension from the active device compute capability instead of scanning every built extension. SM100/SM103 now load the SM100 extension, while SM90 loads the SM90 extension. This avoids exposing kernels from mismatched GPU architectures and reports clearer errors when the matching extension is missing or unsupported. --- cula/cudac.py | 74 +++++++++++++++++++++++++-------------------------- 1 file changed, 36 insertions(+), 38 deletions(-) diff --git a/cula/cudac.py b/cula/cudac.py index 23f96c97..28fb5f38 100644 --- a/cula/cudac.py +++ b/cula/cudac.py @@ -19,23 +19,41 @@ ``cula_cuda.chunk_kda_fwd_intra_cuda(...)`` without knowing which extension provides the function. -Loading is **once per process**: the first attribute access triggers a -single threaded scan of every built ``cula._cudac_sm*`` extension; the -discovered callables are then cached on the module instance and no -further re-scan happens. Installing or rebuilding an extension after a -process has already imported ``cula.cudac`` will therefore not be picked -up -- callers that need a freshly built extension must restart Python. +Loading is **once per process**: the first attribute access checks the +currently active CUDA device, imports the matching ``cula._cudac_sm*`` +extension, and caches the discovered callables on the module instance. +Changing the active CUDA device to a different architecture after a +process has already loaded ``cula.cudac`` will therefore not be picked +up -- callers that need a different extension must restart Python. 
""" import importlib import sys import threading -import warnings from types import ModuleType +def _current_device_extension() -> tuple[str, str]: + try: + import torch + except ImportError as exc: + raise ImportError("cuLA CUDA extensions require PyTorch to detect the current GPU.") from exc + + if not torch.cuda.is_available(): + raise RuntimeError("cuLA CUDA extensions require a visible CUDA GPU, but torch.cuda.is_available() is False.") + + device = torch.cuda.current_device() + prop = torch.cuda.get_device_properties(device) + sm_label = f"sm_{prop.major}{prop.minor}" + if prop.major == 10 and prop.minor in (0, 3): + return "cula._cudac_sm100", sm_label + if prop.major == 9 and prop.minor == 0: + return "cula._cudac_sm90", sm_label + raise RuntimeError(f"Unsupported CUDA compute capability {sm_label}. Supported architectures: sm_100, sm_103, sm_90.") + + class _CudacProxy(ModuleType): - """Lazy proxy that exposes functions from all built arch extensions.""" + """Lazy proxy that exposes functions from the current GPU arch extension.""" def __init__(self): super().__init__(__name__) @@ -50,38 +68,18 @@ def _load(self): with self._lock: if self._modules_loaded: return - loaded_any = False - errors: dict[str, Exception] = {} - # pybind11 extensions surface missing-symbol / ABI / libcudart - # failures as AttributeError or OSError at import time rather - # than ImportError, so catch the broader set to keep matching - # the c955d47 intent of surfacing every per-extension failure. 
- for ext_name in ("cula._cudac_sm100", "cula._cudac_sm90"): - try: - mod = importlib.import_module(ext_name) - for attr in dir(mod): - if not attr.startswith("_"): - self._funcs[attr] = getattr(mod, attr) - loaded_any = True - except (ImportError, AttributeError, OSError) as exc: - errors[ext_name] = exc - if not loaded_any: - details = "; ".join(f"{name}: {exc}" for name, exc in errors.items()) + ext_name, sm_label = _current_device_extension() + try: + mod = importlib.import_module(ext_name) + for attr in dir(mod): + if not attr.startswith("_"): + self._funcs[attr] = getattr(mod, attr) + except (ImportError, AttributeError, OSError) as exc: raise ImportError( - "None of the cuLA CUDA extensions could be imported. " - f"Per-extension errors: [{details}]. " + f"The cuLA CUDA extension for the current GPU ({sm_label}) could not be imported. " + f"Extension {ext_name} failed with: {exc}. " "Please make sure cuLA is compiled correctly." - ) - # Partial failures are not fatal (each surviving extension is - # usable), but the user still needs to know which kernel sets - # are missing so they can diagnose a partial / mismatched build. - if errors: - details = "; ".join(f"{name}: {exc}" for name, exc in errors.items()) - warnings.warn( - "Some cuLA CUDA extensions could not be imported and their " - f"kernels are unavailable. Per-extension errors: [{details}].", - stacklevel=2, - ) + ) from exc self.__dict__.update(self._funcs) self._modules_loaded = True