diff --git a/.github/scripts/cuda-setup.sh b/.github/scripts/cuda-setup.sh
new file mode 100644
index 00000000..c1722f73
--- /dev/null
+++ b/.github/scripts/cuda-setup.sh
@@ -0,0 +1,79 @@
+#!/usr/bin/env bash
+# Shared CUDA environment setup for GPU CI and bench workflows.
+# Finds the CUDA toolkit, detects the GPU's compute capability, and
+# exports env vars that candle-kernels/build.rs needs to compile WMMA
+# kernels to matching SASS.
+#
+# Exports (via $GITHUB_ENV / $GITHUB_PATH):
+#   PATH            — prepends <cuda>/bin
+#   CUDA_PATH       — root of the CUDA installation
+#   LIBRARY_PATH    — <cuda>/lib64, followed by any existing value
+#   LD_LIBRARY_PATH — <cuda>/lib64, followed by any existing value
+#   INFERRS_WMMA_ARCH  — numeric compute cap (e.g. 120)
+#   CUDA_COMPUTE_CAP   — same value, used by bindgen_cuda
+#
+# Exits 1 with a ::error:: annotation when no toolkit is found, when the
+# compute capability or nvcc version cannot be parsed, or when the
+# toolkit is too old for the detected GPU.
+set -euo pipefail
+
+# ---------- Find CUDA installation ----------
+CUDA_FOUND=""
+for d in /usr/local/cuda /opt/cuda /usr; do
+  if [ -x "$d/bin/nvcc" ]; then
+    echo "Found CUDA at $d"
+    echo "$d/bin" >> "$GITHUB_PATH"
+    echo "CUDA_PATH=$d" >> "$GITHUB_ENV"
+    # Append the previous value only when it is non-empty. A trailing ':'
+    # would leave an empty path element, which ld.so (and GCC, for
+    # LIBRARY_PATH) interpret as the current working directory.
+    echo "LIBRARY_PATH=$d/lib64${LIBRARY_PATH:+:$LIBRARY_PATH}" >> "$GITHUB_ENV"
+    echo "LD_LIBRARY_PATH=$d/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}" >> "$GITHUB_ENV"
+    CUDA_FOUND="$d"
+    break
+  fi
+done
+if [ -z "$CUDA_FOUND" ]; then
+  echo "::error::No CUDA installation found (checked /usr/local/cuda, /opt/cuda, /usr)"
+  exit 1
+fi
+"$CUDA_FOUND/bin/nvcc" --version
+
+# ---------- Detect GPU compute capability ----------
+# Takes the first GPU listed and strips the dot, e.g. "12.0" -> "120".
+# sed reads its whole input (unlike head), so nvidia-smi cannot be killed
+# by SIGPIPE and trip pipefail on multi-GPU hosts.
+RAW=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader | sed -n '1p' | tr -d '.[:space:]')
+# Validate before use: a non-numeric value would make the integer test
+# further down error out, which `if` treats as false, silently skipping
+# the toolkit version check.
+if ! [[ "$RAW" =~ ^[0-9]+$ ]]; then
+  echo "::error::Could not parse GPU compute capability from nvidia-smi (got '${RAW}')"
+  exit 1
+fi
+echo "Detected GPU compute capability: sm_${RAW}"
+echo "INFERRS_WMMA_ARCH=${RAW}" >> "$GITHUB_ENV"
+echo "CUDA_COMPUTE_CAP=${RAW}" >> "$GITHUB_ENV"
+
+# ---------- GPU info ----------
+nvidia-smi
+nvidia-smi --query-gpu=name,compute_cap,memory.total,driver_version --format=csv
+
+# ---------- Blackwell nvcc version check ----------
+# sm_120+ requires CUDA toolkit >= 12.8.
+# `|| true` keeps a failed grep from aborting under pipefail so that the
+# check below can report the problem instead of exiting silently.
+NVCC_VER=$("$CUDA_FOUND/bin/nvcc" --version | grep -o 'release [0-9]*\.[0-9]*' | awk '{print $2}' || true)
+if ! [[ "$NVCC_VER" =~ ^[0-9]+\.[0-9]+$ ]]; then
+  echo "::error::Could not parse nvcc version from '$CUDA_FOUND/bin/nvcc --version'"
+  exit 1
+fi
+echo "nvcc version: ${NVCC_VER}"
+if [ "${RAW}" -ge 120 ]; then
+  MAJOR=${NVCC_VER%%.*}
+  MINOR=${NVCC_VER#*.}
+  if [ "$MAJOR" -lt 12 ] || { [ "$MAJOR" -eq 12 ] && [ "$MINOR" -lt 8 ]; }; then
+    echo "::error::sm_${RAW} (Blackwell) requires CUDA >= 12.8 but found ${NVCC_VER}"
+    exit 1
+  fi
+fi
diff --git a/.github/workflows/gpu-bench.yml b/.github/workflows/gpu-bench.yml
new file mode 100644
index 00000000..68a83475
--- /dev/null
+++ b/.github/workflows/gpu-bench.yml
@@ -0,0 +1,104 @@
+# Manually triggered benchmark of `inferrs bench` on the self-hosted CUDA
+# runner, with optional Nsight Systems (nsys) profiling of a single run.
+name: GPU Bench
+
+on:
+  workflow_dispatch:
+    inputs:
+      model:
+        description: 'HuggingFace model ID or local path on runner'
+        required: true
+      prompt_len:
+        description: 'Number of synthetic prompt tokens'
+        required: false
+        default: '128'
+      runs:
+        description: 'Number of timed benchmark runs'
+        required: false
+        default: '5'
+      profile:
+        description: 'Run nsys profiling (true/false)'
+        required: false
+        default: 'false'
+
+# The job only reads the repository; keep GITHUB_TOKEN least-privileged.
+permissions:
+  contents: read
+
+env:
+  CARGO_TERM_COLOR: always
+
+jobs:
+  bench:
+    name: Benchmark (nvidia-runner-1)
+    runs-on: [self-hosted, Linux, X64, cuda]
+    timeout-minutes: 120
+    env:
+      # Dispatch inputs reach the shell as environment variables rather
+      # than being expression-expanded inside `run:` scripts, where quotes
+      # or $(...) in a value would be interpreted as shell syntax.
+      BENCH_MODEL: ${{ inputs.model }}
+      BENCH_RUNS: ${{ inputs.runs }}
+      BENCH_PROMPT_LEN: ${{ inputs.prompt_len }}
+    steps:
+      - uses: actions/checkout@v4
+
+      # Skip apt entirely when both packages are already installed.
+      - name: Install system dependencies
+        run: |
+          dpkg -s pkg-config libssl-dev &>/dev/null || \
+            { sudo apt-get update -q && sudo apt-get install -y --no-install-recommends pkg-config libssl-dev; }
+
+      # Exports CUDA paths and the GPU compute capability for later steps.
+      - name: Set up CUDA + detect GPU
+        run: bash .github/scripts/cuda-setup.sh
+
+      - uses: dtolnay/rust-toolchain@stable
+
+      - uses: Swatinem/rust-cache@v2
+        with:
+          key: gpu-bench
+
+      - name: Build (release, CUDA backend)
+        run: cargo build --release -p inferrs -p inferrs-backend-cuda
+
+      - name: Run inferrs bench
+        run: |
+          cargo run --release -p inferrs -- bench \
+            --model "$BENCH_MODEL" \
+            --warmup 1 \
+            --runs "$BENCH_RUNS" \
+            --prompt-len "$BENCH_PROMPT_LEN"
+
+      # Profile one un-warmed run of the release binary built above.
+      - name: Run nsys profile
+        if: inputs.profile == 'true'
+        run: |
+          nsys profile \
+            --output /tmp/inferrs-bench-trace \
+            --force-overwrite true \
+            --trace cuda,nvtx \
+            --sample none \
+            ./target/release/inferrs bench \
+              --model "$BENCH_MODEL" \
+              --warmup 0 \
+              --runs 1 \
+              --prompt-len "$BENCH_PROMPT_LEN"
+
+      - name: Upload nsys trace
+        if: inputs.profile == 'true'
+        uses: actions/upload-artifact@v4
+        with:
+          name: nsys-trace
+          path: /tmp/inferrs-bench-trace.*
+          retention-days: 14
+
+      # Best effort: a failing `nsys stats` must not fail the job.
+      - name: Kernel-level stats (nsys stats)
+        if: inputs.profile == 'true'
+        run: |
+          nsys stats /tmp/inferrs-bench-trace.nsys-rep \
+            --report gpukernsum \
+            --format csv \
+            --output /tmp/inferrs-kernel-stats || true
+          cat /tmp/inferrs-kernel-stats*.csv 2>/dev/null || echo "nsys stats not available"
diff --git a/.github/workflows/gpu-ci.yml b/.github/workflows/gpu-ci.yml
index 6e563a12..de896a8b 100644
--- a/.github/workflows/gpu-ci.yml
+++ b/.github/workflows/gpu-ci.yml
@@ -20,25 +20,11 @@ jobs:
 
       - name: Install system dependencies
         run: |
-          sudo apt-get update -q
-          sudo apt-get install -y --no-install-recommends \
-            pkg-config libssl-dev
+          dpkg -s pkg-config libssl-dev &>/dev/null || \
+            { sudo apt-get update -q && sudo apt-get install -y --no-install-recommends pkg-config libssl-dev; }
 
-      - name: Set up CUDA environment
-        run: |
-          # Find the CUDA installation (common paths on self-hosted runners)
-          for d in /usr/local/cuda /opt/cuda /usr; do
-            if [ -x "$d/bin/nvcc" ]; then
-              echo "Found CUDA at $d"
-              echo "$d/bin" >> "$GITHUB_PATH"
-              echo "CUDA_PATH=$d" >> "$GITHUB_ENV"
-              echo "LIBRARY_PATH=$d/lib64:${LIBRARY_PATH:-}" >> "$GITHUB_ENV"
-              echo "LD_LIBRARY_PATH=$d/lib64:${LD_LIBRARY_PATH:-}" >> "$GITHUB_ENV"
-              NVCC="$d/bin/nvcc"
-              break
-            fi
-          done
-          "${NVCC:-nvcc}" --version
+      - name: Set up CUDA + detect GPU
+        run: bash .github/scripts/cuda-setup.sh
 
       - uses: dtolnay/rust-toolchain@stable
 
@@ -46,13 +32,6 @@ jobs:
         with:
           key: gpu-ci
 
-      - name: Show GPU info
-        run: nvidia-smi
-
-      # Let bindgen_cuda auto-detect the GPU's compute capability via
-      # nvidia-smi.  The WMMA .cu files now compile to no-op stubs on
-      # SM < 7.0 (via INFERRS_NO_WMMA), so this works on Pascal (6.1)
-      # as well as Volta+ GPUs.
       - name: Build (with CUDA backend)
         run: cargo build -p inferrs -p inferrs-backend-cuda