diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml
index aa8e18d..6909fc1 100644
--- a/.github/workflows/wheels.yml
+++ b/.github/workflows/wheels.yml
@@ -4,22 +4,19 @@ on:
   workflow_dispatch:
 
 jobs:
-  build:
+  build-local:
     strategy:
       fail-fast: false
       matrix:
         os:
           - ubuntu-22.04
           # - windows-2019
-        python:
-          - "3.9"
-          - "3.10"
-          - "3.11"
-          - "3.12"
-        torch_version:
-          - "2.7.0"
-        cuda_short_version:
-          - "126"
+        python: ['3.10', '3.11', '3.12']
+        torch_version: ['2.7.0', '2.8.0', '2.9.0']
+        cuda_short_version: ['126']
+        exclude:
+          - torch_version: '2.9.0'
+            python: '3.9'
 
     uses: ./.github/workflows/wheels_build.yml
     with:
@@ -28,44 +25,82 @@ jobs:
       torch_version: ${{ matrix.torch_version }}
       cuda_short_version: ${{ matrix.cuda_short_version }}
 
+  build-pypi:
+    # Single canonical build intended for PyPI: no local CUDA/torch suffix
+    strategy:
+      fail-fast: false
+      matrix:
+        os: ['ubuntu-22.04']
+        python: ['3.10', '3.11', '3.12']
+
+    uses: ./.github/workflows/wheels_build.yml
+    with:
+      os: ${{ matrix.os }}
+      python: ${{ matrix.python }}
+      torch_version: '2.9.0'
+      cuda_short_version: '128'
+      append_local_version: '0' # 0 to disable local version suffix
+
   # publish to GitHub Release
-  gh_release:
-    name: gh_release
-    needs: build
-    runs-on: ubuntu-20.04
-
-    timeout-minutes: 360
-    defaults:
-      run:
-        shell: bash
+  # gh_release:
+  #   name: gh_release
+  #   needs: build
+  #   runs-on: ubuntu-20.04
+
+  #   timeout-minutes: 360
+  #   defaults:
+  #     run:
+  #       shell: bash
+  #   steps:
+  #   - uses: actions/download-artifact@v4
+  #     with:
+  #       path: dist
+
+  #   - run: ls -R dist/
+
+  #   # create night release if it's a push to main
+  #   - if: github.event_name == 'push' && github.ref == 'refs/heads/main'
+  #     name: Nightly Release
+  #     uses: andelf/nightly-release@v1
+  #     env:
+  #       GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+  #     with:
+  #       tag_name: nightly
+  #       name: 'stable-fast Nightly Release $$'
+  #       prerelease: true
+  #       body: 'TODO: Add nightly release notes'
+  #       files: |
+  #         dist/*/*.whl
+
+  #   # create release if it's a tag like vx.y.z
+  #   - if: github.ref_type == 'tag' && startsWith(github.ref, 'refs/tags/v')
+  #     name: Release
+  #     uses: softprops/action-gh-release@v1
+  #     with :
+  #       files: |
+  #         dist/*/*.whl
+
+
+  consolidate-wheels:
+    needs: [build-local, build-pypi]
+    runs-on: ubuntu-latest
     steps:
-    - uses: actions/download-artifact@v4
+    - name: Download all wheel artifacts
+      uses: actions/download-artifact@v4
       with:
        path: dist
 
-    - run: ls -R dist/
+    - name: Consolidate wheels into a single folder
+      run: |
+        mkdir -p consolidated_wheels
+        find dist -name '*.whl' -exec cp {} consolidated_wheels/ \;
+        ls -l consolidated_wheels
 
-    # create night release if it's a push to main
-    - if: github.event_name == 'push' && github.ref == 'refs/heads/main'
-      name: Nightly Release
-      uses: andelf/nightly-release@v1
-      env:
-        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+    - name: Upload consolidated wheels
+      uses: actions/upload-artifact@v4
       with:
-        tag_name: nightly
-        name: 'stable-fast Nightly Release $$'
-        prerelease: true
-        body: 'TODO: Add nightly release notes'
-        files: |
-          dist/*/*.whl
-
-    # create release if it's a tag like vx.y.z
-    - if: github.ref_type == 'tag' && startsWith(github.ref, 'refs/tags/v')
-      name: Release
-      uses: softprops/action-gh-release@v1
-      with :
-        files: |
-          dist/*/*.whl
+        name: built-wheels
+        path: consolidated_wheels
 
   # upload_pip:
   #   needs: build
diff --git a/.github/workflows/wheels_build.yml b/.github/workflows/wheels_build.yml
index 0872838..8570da9 100644
--- a/.github/workflows/wheels_build.yml
+++ b/.github/workflows/wheels_build.yml
@@ -17,6 +17,11 @@ on:
         required: true
         type: string
         description: "Example: 117 for 11.7"
+      append_local_version:
+        required: false
+        type: string
+        default: '1'
+        description: "Set to '0' to disable SFAST local version suffix"
       cudnn_version_major:
         required: false
         default: '8'
@@ -38,6 +43,11 @@ on:
         required: true
         type: string
         description: "Example: 117 for 11.7"
+      append_local_version:
+        required: false
+        type: string
+        default: '1'
+        description: "Set to '0' to disable SFAST local version suffix"
       cudnn_version_major:
         required: false
         default: '8'
@@ -54,7 +64,7 @@ env:
   TORCH_CUDA_ARCH_LIST: "6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX"
   MAX_JOBS: 2
   DISTUTILS_USE_SDK: 1 # otherwise distutils will complain on windows about multiple versions of msvc
-  SFAST_APPEND_VERSION: 1
+  SFAST_APPEND_VERSION: ${{ inputs.append_local_version }}
   TWINE_USERNAME: __token__
 
 jobs:
@@ -81,12 +91,14 @@ jobs:
         cushort = "${{ inputs.cuda_short_version }}"
         # https://github.com/Jimver/cuda-toolkit/blob/master/src/links/linux-links.ts
         full_version, install_script = {
+            "128": ("12.8.0", "https://developer.download.nvidia.com/compute/cuda/12.8.0/local_installers/cuda_12.8.0_570.86.10_linux.run"),
             "126": ("12.6.0", "https://developer.download.nvidia.com/compute/cuda/12.6.0/local_installers/cuda_12.6.0_560.28.03_linux.run"),
             "118": ("11.8.0", "https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run"),
             "117": ("11.7.1", "https://developer.download.nvidia.com/compute/cuda/11.7.1/local_installers/cuda_11.7.1_515.65.01_linux.run"),
             "116": ("11.6.2", "https://developer.download.nvidia.com/compute/cuda/11.6.2/local_installers/cuda_11.6.2_510.47.03_linux.run"),
         }[cushort]
         cudnn_pypi_package = {
+            "128": "nvidia-cudnn-cu12",
             "126": "nvidia-cudnn-cu12",
             "121": "nvidia-cudnn-cu12",
             "118": "nvidia-cudnn-cu11",
@@ -184,17 +196,9 @@ jobs:
         set -Eeuo pipefail
         git config --global --add safe.directory "*"
         version=`cat version.txt`
-        torch_version_suffix=torch$(echo ${{ inputs.torch_version }} | sed 's/\.//g')
-        cuda_version_suffix=${{ steps.cuda_info.outputs.CUDA_VERSION_SUFFIX }}
-        nightly_tag=$([[ ${VERSION_SOURCE} == 'tag' ]] && echo '' || echo '.dev'`date +%Y%m%d`)
-
-        if [[ "${{ inputs.cuda_short_version }}" == "126" ]]; then
-          echo "BUILD_VERSION=${version}" >> ${GITHUB_ENV}
-          echo "BUILD_VERSION=${version}" >> ${GITHUB_OUTPUT}
-        else
-          echo "BUILD_VERSION=${version}+${torch_version_suffix}${cuda_version_suffix}" >> ${GITHUB_ENV}
-          echo "BUILD_VERSION=${version}+${torch_version_suffix}${cuda_version_suffix}" >> ${GITHUB_OUTPUT}
-        fi
+        echo "BUILD_VERSION=${version}" >> ${GITHUB_ENV}
+        echo "BUILD_VERSION=${version}" >> ${GITHUB_OUTPUT}
+
     - run: echo "sfast-${BUILD_VERSION}"
     - run: echo "release version"
       if: ${{ !contains(steps.sfast_version.outputs.BUILD_VERSION, '.dev') }}
@@ -218,7 +222,7 @@ jobs:
         CUDNN_PYPI_PACKAGE: ${{ steps.cuda_info.outputs.CUDNN_PYPI_PACKAGE }}
       run: |
         cudnn_next_version_major=$((${CUDNN_VERSION_MAJOR} + 1))
-        cudnn_package_name="${CUDNN_PYPI_PACKAGE}>=${CUDNN_VERSION_MAJOR}.0.0.0,<9.6.0.0"
+        cudnn_package_name="${CUDNN_PYPI_PACKAGE}>=${CUDNN_VERSION_MAJOR}.0.0.0,<=9.10.2.21"
         $PY -m pip install --upgrade pip
         $PY -m pip install wheel setuptools ninja twine "torch==${{ inputs.torch_version }}" "${cudnn_package_name}" -r requirements.txt --extra-index-url https://download.pytorch.org/whl/cu${{ inputs.cuda_short_version }} --no-cache-dir
 
@@ -229,8 +233,18 @@ jobs:
         PLAT_ARG: ${{ contains(inputs.os, 'ubuntu') && '--plat-name manylinux2014_x86_64' || '' }}
     - run: du -h dist/*
-    - uses: actions/upload-artifact@v4
+
+    - name: Upload artifact (local build)
+      if: ${{ inputs.append_local_version != '0' }}
+      uses: actions/upload-artifact@v4
       with:
         name: ${{ inputs.os }}-py${{ inputs.python }}-torch${{ inputs.torch_version }}+cu${{ inputs.cuda_short_version }}
         path: dist/*.whl
+
+    - name: Upload artifact (pypi build)
+      if: ${{ inputs.append_local_version == '0' }}
+      uses: actions/upload-artifact@v4
+      with:
+        name: ${{ inputs.os }}-py${{ inputs.python }}
+        path: dist/*.whl
 
 # Note: it might be helpful to have additional steps that test if the built wheels actually work
diff --git a/setup.py b/setup.py
index 9a6e99a..ee6dfe1 100644
--- a/setup.py
+++ b/setup.py
@@ -1,16 +1,16 @@
 #!/usr/bin/env python
 import glob
+import os
 import platform
 import subprocess
-import os
 
 # import shutil
 from os import path
 
-from setuptools import find_packages, setup
 # from typing import List
 
 import torch
+from setuptools import find_packages, setup
 from torch.utils.cpp_extension import CUDA_HOME, CUDNN_HOME, CppExtension, CUDAExtension
 
 torch_ver = [int(x) for x in torch.__version__.split(".")[:2]]
@@ -19,7 +19,36 @@ def fetch_requirements():
     with open("requirements.txt") as f:
-        reqs = f.read().strip().split("\n")
+        lines = f.read().strip().split("\n")
+
+    # Drop empty/comment lines
+    base_reqs = []
+    for line in lines:
+        stripped = line.strip()
+        if not stripped or stripped.startswith("#"):
+            continue
+        base_reqs.append(stripped)
+
+    # Constrain torch to the major.minor of the torch that was used to build
+    # the wheel, e.g. torch 2.7.0 -> "torch>=2.7,<2.8".
+    major, minor = torch_ver[0], torch_ver[1]
+    torch_constraint = f"torch>={major}.{minor},<{major}.{minor + 1}"
+
+    reqs = []
+    torch_added = False
+    for req in base_reqs:
+        # Replace any existing torch specification with our constrained one.
+        if req.split()[0].startswith("torch"):
+            if not torch_added:
+                reqs.append(torch_constraint)
+                torch_added = True
+            # Skip additional torch lines
+        else:
+            reqs.append(req)
+
+    if not torch_added:
+        reqs.append(torch_constraint)
+
     return reqs
 
 
@@ -41,6 +70,42 @@ def get_version():
         date_str = datetime.today().strftime("%y%m%d")
         version = version + ".dev" + date_str
 
+    # Optionally append the CUDA and torch version used to build this
+    # wheel as a local version suffix, e.g. "+cu126.torch2.7".
+    if os.getenv("SFAST_APPEND_VERSION", "0") not in (
+        "0",
+        "",
+        "false",
+        "False",
+    ):
+        torch_version_str = os.getenv("TORCH_VERSION_FOR_BUILD", torch.__version__)
+
+        # torch_version_str is typically like "2.7.0+cu126" or "2.7.0".
+        cuda_token = ""
+        base_torch = torch_version_str
+        if "+" in torch_version_str:
+            base_torch, _, local_torch = torch_version_str.partition("+")
+            # Take the first token from the local version, e.g. "cu126".
+            cuda_token = local_torch.split(".")[0]
+
+        base_parts = base_torch.split(".")
+        if len(base_parts) >= 2:
+            torch_major_minor = ".".join(base_parts[:2])
+        else:
+            torch_major_minor = base_torch
+
+        local_suffix = f"torch{torch_major_minor}"
+        if cuda_token:
+            local_suffix = f"{cuda_token}.{local_suffix}"
+
+        # Ensure we only ever have a single '+' in the version; if there is
+        # already a local segment, extend it with ".cuXXX.torchY.Z".
+        if "+" in version:
+            base_v, _, local_v = version.partition("+")
+            version = f"{base_v}+{local_v}.{local_suffix}"
+        else:
+            version = f"{version}+{local_suffix}"
+
     init_py_path = path.join(this_dir, "src", "sfast", "__init__.py")
     init_py = open(init_py_path, "r").readlines()
     new_init_py = [l for l in init_py if not l.startswith("__version__")]
@@ -52,8 +117,7 @@ def get_version():
 
 def get_cuda_version(cuda_dir) -> int:
     nvcc_bin = "nvcc" if cuda_dir is None else cuda_dir + "/bin/nvcc"
-    raw_output = subprocess.check_output([nvcc_bin, "-V"],
-                                         universal_newlines=True)
+    raw_output = subprocess.check_output([nvcc_bin, "-V"], universal_newlines=True)
     output = raw_output.split()
     release_idx = output.index("release") + 1
     release = output[release_idx].split(".")
@@ -69,13 +133,10 @@ def get_extensions():
 
     extensions_dir = path.join(this_dir, "src", "sfast", "csrc")
 
     include_dirs = [extensions_dir]
-    sources = glob.glob(path.join(extensions_dir, "**", "*.cpp"),
-                        recursive=True)
+    sources = glob.glob(path.join(extensions_dir, "**", "*.cpp"), recursive=True)
     # common code between cuda and rocm platforms, for hipify version [1,0,0] and later.
-    source_cuda = glob.glob(path.join(extensions_dir, "**", "*.cu"),
-                            recursive=True)
-    source_cuda_rt = glob.glob(path.join(extensions_dir, "**", "*.cc"),
-                               recursive=True)
+    source_cuda = glob.glob(path.join(extensions_dir, "**", "*.cu"), recursive=True)
+    source_cuda_rt = glob.glob(path.join(extensions_dir, "**", "*.cc"), recursive=True)
 
     extension = CppExtension
@@ -89,19 +150,25 @@ def get_extensions():
     # Skip the above useless check as we will always compile with CUDA support,
     # and the CI might be running on CPU-only machines.
     if platform.system() != "Darwin" and os.getenv("WITH_CUDA", "1") != "0":
-        assert CUDA_HOME is not None, "Cannot find CUDA installation. If you want to compile without CUDA, set `WITH_CUDA=0`."
+        assert CUDA_HOME is not None, (
+            "Cannot find CUDA installation. If you want to compile without CUDA, set `WITH_CUDA=0`."
+        )
 
         cutlass_root = os.path.join(this_dir, "third_party", "cutlass")
         cutlass_include = os.path.join(cutlass_root, "include")
-        if not os.path.exists(cutlass_root) or not os.path.exists(
-                cutlass_include):
-            raise RuntimeError("Cannot find cutlass. Please run "
-                               "`git submodule update --init --recursive`.")
+        if not os.path.exists(cutlass_root) or not os.path.exists(cutlass_include):
+            raise RuntimeError(
+                "Cannot find cutlass. Please run "
+                "`git submodule update --init --recursive`."
+            )
         include_dirs.append(cutlass_include)
-        cutlass_tools_util_include = os.path.join(cutlass_root, "tools",
-                                                  "util", "include")
+        cutlass_tools_util_include = os.path.join(
+            cutlass_root, "tools", "util", "include"
+        )
         include_dirs.append(cutlass_tools_util_include)
-        cutlass_examples_dual_gemm = os.path.join(cutlass_root, "examples", "45_dual_gemm")
+        cutlass_examples_dual_gemm = os.path.join(
+            cutlass_root, "examples", "45_dual_gemm"
+        )
         include_dirs.append(cutlass_examples_dual_gemm)
 
         extension = CUDAExtension
@@ -155,6 +222,11 @@ def get_extensions():
             extra_compile_args["nvcc"].append("-ccbin={}".format(CC))
 
         if CUDNN_HOME is None:
+            # Prefer PyTorch's own toolchain for cuDNN / cuBLAS when possible.
+            # For older NVIDIA wheels (where nvidia.cudnn/cublas had a real __file__)
+            # we still support discovering headers via that path, but newer PyTorch
+            # / NVIDIA packaging may expose these as non file-backed modules, in which
+            # case we simply fall back to PyTorch/CUDA's default include/library dirs.
             try:
                 # Try to use the bundled version of CUDNN with PyTorch installation.
                 # This is also used in CI.
@@ -162,8 +234,19 @@ def get_extensions():
             except ImportError:
                 cudnn = None
 
+            cudnn_dir = None
             if cudnn is not None:
-                cudnn_dir = os.path.dirname(cudnn.__file__)
+                # Prefer __file__ for regular packages; fall back to __path__ for
+                # namespace-style packages used by newer NVIDIA wheels.
+                cudnn_file = getattr(cudnn, "__file__", None)
+                if cudnn_file:
+                    cudnn_dir = os.path.dirname(cudnn_file)
+                else:
+                    cudnn_paths = getattr(cudnn, "__path__", None)
+                    if cudnn_paths:
+                        cudnn_dir = list(cudnn_paths)[0]
+
+            if cudnn_dir is not None:
                 print("Using CUDNN from {}".format(cudnn_dir))
                 include_dirs.append(os.path.join(cudnn_dir, "include"))
                 # Hope PyTorch knows how to link it correctly.
@@ -181,8 +264,17 @@ def get_extensions():
             except ImportError:
                 cublas = None
 
+            cublas_dir = None
             if cublas is not None:
-                cublas_dir = os.path.dirname(cublas.__file__)
+                cublas_file = getattr(cublas, "__file__", None)
+                if cublas_file:
+                    cublas_dir = os.path.dirname(cublas_file)
+                else:
+                    cublas_paths = getattr(cublas, "__path__", None)
+                    if cublas_paths:
+                        cublas_dir = list(cublas_paths)[0]
+
+            if cublas_dir is not None:
                 print("Using CUBLAS from {}".format(cublas_dir))
                 include_dirs.append(os.path.join(cublas_dir, "include"))
                 # Hope PyTorch knows how to link it correctly.
@@ -222,13 +314,12 @@ def get_extensions():
     version=get_version(),
     author="Cheng Zeyi",
     url="https://github.com/chengzeyi/stable-fast",
-    description=
-    "Stable Fast is an ultra lightweight performance optimization framework"
+    description="Stable Fast is an ultra lightweight performance optimization framework"
     " for Hugging Fase diffuser pipelines.",
     package_dir={
-        '': 'src',
+        "": "src",
     },
-    packages=find_packages(where='src'),
+    packages=find_packages(where="src"),
     # include submodules in third_party
     python_requires=">=3.7",
     install_requires=fetch_requirements(),
diff --git a/version.txt b/version.txt
index 238d6e8..b0f3d96 100644
--- a/version.txt
+++ b/version.txt
@@ -1 +1 @@
-1.0.7
+1.0.8