From dfee794d87f7d9eb26f2d3a9a2caa99dd1f6fe57 Mon Sep 17 00:00:00 2001
From: Rob Taylor <rob.taylor@chipflow.io>
Date: Wed, 15 Apr 2026 21:14:33 +0100
Subject: [PATCH 1/2] ci: auto-detect GPU arch for WMMA kernels, add bench
 workflow

gpu-ci.yml: detect the runner's compute capability via nvidia-smi and
export INFERRS_WMMA_ARCH + CUDA_COMPUTE_CAP so candle-kernels/build.rs
compiles WMMA SASS matching the actual GPU. The previous default
(sm_80) only runs on Ampere; the RTX 5060 Ti is sm_120 (Blackwell).
Adds an early check that nvcc >= 12.8 when targeting sm_120+.

gpu-bench.yml: new workflow_dispatch workflow for running `inferrs
bench` and optional nsys profiling on the self-hosted GPU runner.
Produces timing stats and uploads nsys traces as artifacts.

Co-developed-by: Claude Code v2.1.104 (claude-opus-4-6)
---
 .github/workflows/gpu-bench.yml | 114 ++++++++++++++++++++++++++++++++
 .github/workflows/gpu-ci.yml    |  30 +++++++--
 2 files changed, 140 insertions(+), 4 deletions(-)
 create mode 100644 .github/workflows/gpu-bench.yml

diff --git a/.github/workflows/gpu-bench.yml b/.github/workflows/gpu-bench.yml
new file mode 100644
index 00000000..08846069
--- /dev/null
+++ b/.github/workflows/gpu-bench.yml
@@ -0,0 +1,114 @@
+name: GPU Bench
+
+on:
+  # Manual trigger with configurable model
+  workflow_dispatch:
+    inputs:
+      model:
+        description: 'HuggingFace model ID or local path on runner'
+        required: false
+        default: ''
+      prompt_len:
+        description: 'Number of synthetic prompt tokens'
+        required: false
+        default: '128'
+      runs:
+        description: 'Number of timed benchmark runs'
+        required: false
+        default: '5'
+      profile:
+        description: 'Run nsys profiling (true/false)'
+        required: false
+        default: 'false'
+
+env:
+  CARGO_TERM_COLOR: always
+
+jobs:
+  bench:
+    name: Benchmark (nvidia-runner-1)
+    runs-on: [self-hosted, Linux, X64, cuda]
+    timeout-minutes: 120
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Install system dependencies
+        run: |
+          sudo apt-get update -q
+          sudo apt-get install -y --no-install-recommends \
+            pkg-config libssl-dev
+
+      - name: Set up CUDA environment
+        run: |
+          for d in /usr/local/cuda /opt/cuda /usr; do
+            if [ -x "$d/bin/nvcc" ]; then
+              echo "Found CUDA at $d"
+              echo "$d/bin" >> "$GITHUB_PATH"
+              echo "CUDA_PATH=$d" >> "$GITHUB_ENV"
+              echo "LIBRARY_PATH=$d/lib64:${LIBRARY_PATH:-}" >> "$GITHUB_ENV"
+              echo "LD_LIBRARY_PATH=$d/lib64:${LD_LIBRARY_PATH:-}" >> "$GITHUB_ENV"
+              break
+            fi
+          done
+
+      - name: Detect GPU compute capability
+        run: |
+          RAW=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader | head -1 | tr -d '.')
+          echo "Detected GPU compute capability: sm_${RAW}"
+          echo "INFERRS_WMMA_ARCH=${RAW}" >> "$GITHUB_ENV"
+          echo "CUDA_COMPUTE_CAP=${RAW}" >> "$GITHUB_ENV"
+
+      - uses: dtolnay/rust-toolchain@stable
+
+      - uses: Swatinem/rust-cache@v2
+        with:
+          key: gpu-bench
+
+      - name: GPU info
+        run: |
+          nvidia-smi
+          nvidia-smi --query-gpu=name,compute_cap,memory.total,driver_version --format=csv
+
+      - name: Build (release, CUDA backend)
+        run: cargo build --release -p inferrs -p inferrs-backend-cuda
+
+      - name: Run inferrs bench
+        if: inputs.model != ''
+        run: |
+          cargo run --release -p inferrs -- bench \
+            --model "${{ inputs.model }}" \
+            --warmup 1 \
+            --runs "${{ inputs.runs }}" \
+            --prompt-len "${{ inputs.prompt_len }}"
+
+      - name: Run nsys profile
+        if: inputs.profile == 'true' && inputs.model != ''
+        run: |
+          # Single-run nsys trace for kernel-level analysis
+          nsys profile \
+            --output /tmp/inferrs-bench-trace \
+            --force-overwrite true \
+            --trace cuda,nvtx \
+            --sample none \
+            cargo run --release -p inferrs -- bench \
+              --model "${{ inputs.model }}" \
+              --warmup 0 \
+              --runs 1 \
+              --prompt-len "${{ inputs.prompt_len }}"
+
+      - name: Upload nsys trace
+        if: inputs.profile == 'true' && inputs.model != ''
+        uses: actions/upload-artifact@v4
+        with:
+          name: nsys-trace
+          path: /tmp/inferrs-bench-trace.*
+          retention-days: 14
+
+      - name: Kernel-level stats (nsys stats)
+        if: inputs.profile == 'true' && inputs.model != ''
+        run: |
+          nsys stats /tmp/inferrs-bench-trace.nsys-rep \
+            --report gpukernsum \
+            --format csv \
+            --output /tmp/inferrs-kernel-stats || true
+          cat /tmp/inferrs-kernel-stats*.csv 2>/dev/null || echo "nsys stats not available"
diff --git a/.github/workflows/gpu-ci.yml b/.github/workflows/gpu-ci.yml
index 6e563a12..ac303d29 100644
--- a/.github/workflows/gpu-ci.yml
+++ b/.github/workflows/gpu-ci.yml
@@ -40,6 +40,29 @@ jobs:
           done
           "${NVCC:-nvcc}" --version
 
+      - name: Detect GPU compute capability
+        run: |
+          # Query the GPU's compute capability and export it so
+          # candle-kernels/build.rs compiles WMMA kernels to matching SASS.
+          # Without this, DEFAULT_WMMA_ARCH=80 produces sm_80 SASS which
+          # only runs on Ampere — not on Blackwell (sm_120) or other arches.
+          RAW=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader | head -1 | tr -d '.')
+          echo "Detected GPU compute capability: sm_${RAW}"
+          echo "INFERRS_WMMA_ARCH=${RAW}" >> "$GITHUB_ENV"
+          echo "CUDA_COMPUTE_CAP=${RAW}" >> "$GITHUB_ENV"
+          # Blackwell (sm_120) requires CUDA toolkit >= 12.8. Warn early
+          # if the toolkit is too old.
+          NVCC_VER=$(nvcc --version | grep -oP 'release \K[0-9]+\.[0-9]+')
+          echo "nvcc version: ${NVCC_VER}"
+          if [ "${RAW}" -ge 120 ]; then
+            MAJOR=$(echo "$NVCC_VER" | cut -d. -f1)
+            MINOR=$(echo "$NVCC_VER" | cut -d. -f2)
+            if [ "$MAJOR" -lt 12 ] || { [ "$MAJOR" -eq 12 ] && [ "$MINOR" -lt 8 ]; }; then
+              echo "::error::sm_${RAW} (Blackwell) requires CUDA >= 12.8 but found ${NVCC_VER}"
+              exit 1
+            fi
+          fi
+
       - uses: dtolnay/rust-toolchain@stable
 
       - uses: Swatinem/rust-cache@v2
@@ -49,10 +72,9 @@ jobs:
       - name: Show GPU info
         run: nvidia-smi
 
-      # Let bindgen_cuda auto-detect the GPU's compute capability via
-      # nvidia-smi.  The WMMA .cu files now compile to no-op stubs on
-      # SM < 7.0 (via INFERRS_NO_WMMA), so this works on Pascal (6.1)
-      # as well as Volta+ GPUs.
+      # INFERRS_WMMA_ARCH and CUDA_COMPUTE_CAP are set above from the
+      # runner's actual GPU so both WMMA and non-WMMA kernels compile to
+      # the correct SASS.
       - name: Build (with CUDA backend)
         run: cargo build -p inferrs -p inferrs-backend-cuda
 

From 840e9e53be029f11469af248d11df6f02dc02f96 Mon Sep 17 00:00:00 2001
From: Rob Taylor <rob.taylor@chipflow.io>
Date: Wed, 15 Apr 2026 23:29:33 +0100
Subject: [PATCH 2/2] refactor(ci): extract shared CUDA setup, fix review
 findings
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Extract CUDA path detection, CC detection, and Blackwell nvcc
  version check into .github/scripts/cuda-setup.sh — eliminates
  duplication between gpu-ci.yml and gpu-bench.yml, and closes the
  gap where gpu-bench.yml was missing the Blackwell guard.
- Fail fast when no CUDA installation is found (was silent fallback).
- Replace grep -oP (Perl regex, non-portable) with grep -o + awk.
- Make model input required in gpu-bench.yml (was silently skipping
  bench when empty default was used).
- Use pre-built binary for nsys profiling (avoids redundant link).
- Merge redundant nvidia-smi calls into the shared setup script.
- Guard apt-get install with dpkg -s check for self-hosted runner.

Co-developed-by: Claude Code v2.1.104 (claude-opus-4-6)
---
 .github/scripts/cuda-setup.sh   | 56 +++++++++++++++++++++++++++++++++
 .github/workflows/gpu-bench.yml | 45 ++++++--------------------
 .github/workflows/gpu-ci.yml    | 51 +++---------------------------
 3 files changed, 69 insertions(+), 83 deletions(-)
 create mode 100644 .github/scripts/cuda-setup.sh

diff --git a/.github/scripts/cuda-setup.sh b/.github/scripts/cuda-setup.sh
new file mode 100644
index 00000000..c1722f73
--- /dev/null
+++ b/.github/scripts/cuda-setup.sh
@@ -0,0 +1,56 @@
+#!/usr/bin/env bash
+# Shared CUDA environment setup for GPU CI and bench workflows.
+# Finds the CUDA toolkit, detects the GPU's compute capability, and
+# exports env vars that candle-kernels/build.rs needs to compile WMMA
+# kernels to matching SASS.
+#
+# Exports (via $GITHUB_ENV / $GITHUB_PATH):
+#   PATH            — prepends <cuda>/bin
+#   CUDA_PATH       — root of the CUDA installation
+#   LIBRARY_PATH    — prepends <cuda>/lib64 (no trailing ':' when previously unset)
+#   LD_LIBRARY_PATH — prepends <cuda>/lib64 (an empty entry would mean the current dir)
+#   INFERRS_WMMA_ARCH  — numeric compute cap (e.g. 120); rejected unless all digits
+#   CUDA_COMPUTE_CAP   — same value, used by bindgen_cuda
+set -euo pipefail
+
+# ---------- Find CUDA installation ----------
+CUDA_FOUND=""
+for d in /usr/local/cuda /opt/cuda /usr; do
+  if [ -x "$d/bin/nvcc" ]; then
+    echo "Found CUDA at $d"
+    echo "$d/bin" >> "$GITHUB_PATH"
+    echo "CUDA_PATH=$d" >> "$GITHUB_ENV"
+    echo "LIBRARY_PATH=$d/lib64${LIBRARY_PATH:+:$LIBRARY_PATH}" >> "$GITHUB_ENV"
+    echo "LD_LIBRARY_PATH=$d/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}" >> "$GITHUB_ENV"
+    CUDA_FOUND="$d"
+    break
+  fi
+done
+if [ -z "$CUDA_FOUND" ]; then
+  echo "::error::No CUDA installation found (checked /usr/local/cuda, /opt/cuda, /usr)"
+  exit 1
+fi
+"$CUDA_FOUND/bin/nvcc" --version
+
+# ---------- Detect GPU compute capability ----------
+RAW=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader | head -1 | tr -d '.[:space:]')
+[[ "$RAW" =~ ^[0-9]+$ ]] || { echo "::error::Unexpected compute capability from nvidia-smi: '${RAW}'"; exit 1; }
+echo "Detected GPU compute capability: sm_${RAW}"
+echo "INFERRS_WMMA_ARCH=${RAW}" >> "$GITHUB_ENV"
+echo "CUDA_COMPUTE_CAP=${RAW}" >> "$GITHUB_ENV"
+
+# ---------- GPU info ----------
+nvidia-smi
+nvidia-smi --query-gpu=name,compute_cap,memory.total,driver_version --format=csv
+
+# ---------- Blackwell nvcc version check ----------
+# No Blackwell target (sm_100 and newer) is supported before CUDA toolkit 12.8.
+NVCC_VER=$("$CUDA_FOUND/bin/nvcc" --version | grep -o 'release [0-9]*\.[0-9]*' | awk '{print $2}')
+echo "nvcc version: ${NVCC_VER}"
+if [ "${RAW}" -ge 100 ]; then
+  IFS=. read -r MAJOR MINOR <<< "$NVCC_VER"
+  if [ "$MAJOR" -lt 12 ] || { [ "$MAJOR" -eq 12 ] && [ "$MINOR" -lt 8 ]; }; then
+    echo "::error::sm_${RAW} (Blackwell) requires CUDA >= 12.8 but found ${NVCC_VER}"
+    exit 1
+  fi
+fi
diff --git a/.github/workflows/gpu-bench.yml b/.github/workflows/gpu-bench.yml
index 08846069..68a83475 100644
--- a/.github/workflows/gpu-bench.yml
+++ b/.github/workflows/gpu-bench.yml
@@ -1,13 +1,11 @@
 name: GPU Bench
 
 on:
-  # Manual trigger with configurable model
   workflow_dispatch:
     inputs:
       model:
         description: 'HuggingFace model ID or local path on runner'
-        required: false
-        default: ''
+        required: true
       prompt_len:
         description: 'Number of synthetic prompt tokens'
         required: false
@@ -34,29 +32,11 @@ jobs:
 
       - name: Install system dependencies
         run: |
-          sudo apt-get update -q
-          sudo apt-get install -y --no-install-recommends \
-            pkg-config libssl-dev
-
-      - name: Set up CUDA environment
-        run: |
-          for d in /usr/local/cuda /opt/cuda /usr; do
-            if [ -x "$d/bin/nvcc" ]; then
-              echo "Found CUDA at $d"
-              echo "$d/bin" >> "$GITHUB_PATH"
-              echo "CUDA_PATH=$d" >> "$GITHUB_ENV"
-              echo "LIBRARY_PATH=$d/lib64:${LIBRARY_PATH:-}" >> "$GITHUB_ENV"
-              echo "LD_LIBRARY_PATH=$d/lib64:${LD_LIBRARY_PATH:-}" >> "$GITHUB_ENV"
-              break
-            fi
-          done
+          dpkg -s pkg-config libssl-dev &>/dev/null || \
+            { sudo apt-get update -q && sudo apt-get install -y --no-install-recommends pkg-config libssl-dev; }
 
-      - name: Detect GPU compute capability
-        run: |
-          RAW=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader | head -1 | tr -d '.')
-          echo "Detected GPU compute capability: sm_${RAW}"
-          echo "INFERRS_WMMA_ARCH=${RAW}" >> "$GITHUB_ENV"
-          echo "CUDA_COMPUTE_CAP=${RAW}" >> "$GITHUB_ENV"
+      - name: Set up CUDA + detect GPU
+        run: bash .github/scripts/cuda-setup.sh
 
       - uses: dtolnay/rust-toolchain@stable
 
@@ -64,40 +44,43 @@ jobs:
         with:
           key: gpu-bench
 
-      - name: GPU info
-        run: |
-          nvidia-smi
-          nvidia-smi --query-gpu=name,compute_cap,memory.total,driver_version --format=csv
-
       - name: Build (release, CUDA backend)
         run: cargo build --release -p inferrs -p inferrs-backend-cuda
 
       - name: Run inferrs bench
-        if: inputs.model != ''
+        # Dispatch inputs reach the script as env vars rather than being
+        # expanded into it, so their contents are never parsed as shell.
+        env:
+          BENCH_MODEL: ${{ inputs.model }}
+          BENCH_RUNS: ${{ inputs.runs }}
+          BENCH_PROMPT_LEN: ${{ inputs.prompt_len }}
         run: |
           cargo run --release -p inferrs -- bench \
-            --model "${{ inputs.model }}" \
+            --model "$BENCH_MODEL" \
             --warmup 1 \
-            --runs "${{ inputs.runs }}" \
-            --prompt-len "${{ inputs.prompt_len }}"
+            --runs "$BENCH_RUNS" \
+            --prompt-len "$BENCH_PROMPT_LEN"
 
       - name: Run nsys profile
-        if: inputs.profile == 'true' && inputs.model != ''
+        if: inputs.profile == 'true'
+        # Same env indirection as the bench step above.
+        env:
+          BENCH_MODEL: ${{ inputs.model }}
+          BENCH_PROMPT_LEN: ${{ inputs.prompt_len }}
         run: |
-          # Single-run nsys trace for kernel-level analysis
           nsys profile \
             --output /tmp/inferrs-bench-trace \
             --force-overwrite true \
             --trace cuda,nvtx \
             --sample none \
-            cargo run --release -p inferrs -- bench \
-              --model "${{ inputs.model }}" \
+            ./target/release/inferrs bench \
+              --model "$BENCH_MODEL" \
               --warmup 0 \
               --runs 1 \
-              --prompt-len "${{ inputs.prompt_len }}"
+              --prompt-len "$BENCH_PROMPT_LEN"
 
       - name: Upload nsys trace
-        if: inputs.profile == 'true' && inputs.model != ''
+        if: inputs.profile == 'true'
         uses: actions/upload-artifact@v4
         with:
           name: nsys-trace
@@ -105,7 +88,7 @@ jobs:
           retention-days: 14
 
       - name: Kernel-level stats (nsys stats)
-        if: inputs.profile == 'true' && inputs.model != ''
+        if: inputs.profile == 'true'
         run: |
           nsys stats /tmp/inferrs-bench-trace.nsys-rep \
             --report gpukernsum \
diff --git a/.github/workflows/gpu-ci.yml b/.github/workflows/gpu-ci.yml
index ac303d29..de896a8b 100644
--- a/.github/workflows/gpu-ci.yml
+++ b/.github/workflows/gpu-ci.yml
@@ -20,48 +20,11 @@ jobs:
 
       - name: Install system dependencies
         run: |
-          sudo apt-get update -q
-          sudo apt-get install -y --no-install-recommends \
-            pkg-config libssl-dev
+          dpkg -s pkg-config libssl-dev &>/dev/null || \
+            { sudo apt-get update -q && sudo apt-get install -y --no-install-recommends pkg-config libssl-dev; }
 
-      - name: Set up CUDA environment
-        run: |
-          # Find the CUDA installation (common paths on self-hosted runners)
-          for d in /usr/local/cuda /opt/cuda /usr; do
-            if [ -x "$d/bin/nvcc" ]; then
-              echo "Found CUDA at $d"
-              echo "$d/bin" >> "$GITHUB_PATH"
-              echo "CUDA_PATH=$d" >> "$GITHUB_ENV"
-              echo "LIBRARY_PATH=$d/lib64:${LIBRARY_PATH:-}" >> "$GITHUB_ENV"
-              echo "LD_LIBRARY_PATH=$d/lib64:${LD_LIBRARY_PATH:-}" >> "$GITHUB_ENV"
-              NVCC="$d/bin/nvcc"
-              break
-            fi
-          done
-          "${NVCC:-nvcc}" --version
-
-      - name: Detect GPU compute capability
-        run: |
-          # Query the GPU's compute capability and export it so
-          # candle-kernels/build.rs compiles WMMA kernels to matching SASS.
-          # Without this, DEFAULT_WMMA_ARCH=80 produces sm_80 SASS which
-          # only runs on Ampere — not on Blackwell (sm_120) or other arches.
-          RAW=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader | head -1 | tr -d '.')
-          echo "Detected GPU compute capability: sm_${RAW}"
-          echo "INFERRS_WMMA_ARCH=${RAW}" >> "$GITHUB_ENV"
-          echo "CUDA_COMPUTE_CAP=${RAW}" >> "$GITHUB_ENV"
-          # Blackwell (sm_120) requires CUDA toolkit >= 12.8. Warn early
-          # if the toolkit is too old.
-          NVCC_VER=$(nvcc --version | grep -oP 'release \K[0-9]+\.[0-9]+')
-          echo "nvcc version: ${NVCC_VER}"
-          if [ "${RAW}" -ge 120 ]; then
-            MAJOR=$(echo "$NVCC_VER" | cut -d. -f1)
-            MINOR=$(echo "$NVCC_VER" | cut -d. -f2)
-            if [ "$MAJOR" -lt 12 ] || { [ "$MAJOR" -eq 12 ] && [ "$MINOR" -lt 8 ]; }; then
-              echo "::error::sm_${RAW} (Blackwell) requires CUDA >= 12.8 but found ${NVCC_VER}"
-              exit 1
-            fi
-          fi
+      - name: Set up CUDA + detect GPU
+        run: bash .github/scripts/cuda-setup.sh
 
       - uses: dtolnay/rust-toolchain@stable
 
@@ -69,12 +32,6 @@ jobs:
         with:
           key: gpu-ci
 
-      - name: Show GPU info
-        run: nvidia-smi
-
-      # INFERRS_WMMA_ARCH and CUDA_COMPUTE_CAP are set above from the
-      # runner's actual GPU so both WMMA and non-WMMA kernels compile to
-      # the correct SASS.
       - name: Build (with CUDA backend)
         run: cargo build -p inferrs -p inferrs-backend-cuda