From dfee794d87f7d9eb26f2d3a9a2caa99dd1f6fe57 Mon Sep 17 00:00:00 2001
From: Rob Taylor
Date: Wed, 15 Apr 2026 21:14:33 +0100
Subject: [PATCH 1/2] ci: auto-detect GPU arch for WMMA kernels, add bench workflow

gpu-ci.yml: detect the runner's compute capability via nvidia-smi and
export INFERRS_WMMA_ARCH + CUDA_COMPUTE_CAP so candle-kernels/build.rs
compiles WMMA SASS matching the actual GPU. The previous default (sm_80)
only runs on Ampere; the RTX 5060 Ti is sm_120 (Blackwell). Adds an
early check that nvcc >= 12.8 when targeting sm_120+.

gpu-bench.yml: new workflow_dispatch workflow for running `inferrs bench`
and optional nsys profiling on the self-hosted GPU runner. Produces
timing stats and uploads nsys traces as artifacts.

Co-developed-by: Claude Code v2.1.104 (claude-opus-4-6)
---
 .github/workflows/gpu-bench.yml | 114 ++++++++++++++++++++++++++++++++
 .github/workflows/gpu-ci.yml    |  30 +++++++--
 2 files changed, 140 insertions(+), 4 deletions(-)
 create mode 100644 .github/workflows/gpu-bench.yml

diff --git a/.github/workflows/gpu-bench.yml b/.github/workflows/gpu-bench.yml
new file mode 100644
index 00000000..08846069
--- /dev/null
+++ b/.github/workflows/gpu-bench.yml
@@ -0,0 +1,114 @@
+name: GPU Bench
+
+on:
+  # Manual trigger with configurable model
+  workflow_dispatch:
+    inputs:
+      model:
+        description: 'HuggingFace model ID or local path on runner'
+        required: false
+        default: ''
+      prompt_len:
+        description: 'Number of synthetic prompt tokens'
+        required: false
+        default: '128'
+      runs:
+        description: 'Number of timed benchmark runs'
+        required: false
+        default: '5'
+      profile:
+        description: 'Run nsys profiling (true/false)'
+        required: false
+        default: 'false'
+
+env:
+  CARGO_TERM_COLOR: always
+
+jobs:
+  bench:
+    name: Benchmark (nvidia-runner-1)
+    runs-on: [self-hosted, Linux, X64, cuda]
+    timeout-minutes: 120
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Install system dependencies
+        run: |
+          sudo apt-get update -q
+          sudo apt-get install -y --no-install-recommends \
+            pkg-config libssl-dev
+
+      - name: Set up CUDA environment
+        run: |
+          for d in /usr/local/cuda /opt/cuda /usr; do
+            if [ -x "$d/bin/nvcc" ]; then
+              echo "Found CUDA at $d"
+              echo "$d/bin" >> "$GITHUB_PATH"
+              echo "CUDA_PATH=$d" >> "$GITHUB_ENV"
+              echo "LIBRARY_PATH=$d/lib64:${LIBRARY_PATH:-}" >> "$GITHUB_ENV"
+              echo "LD_LIBRARY_PATH=$d/lib64:${LD_LIBRARY_PATH:-}" >> "$GITHUB_ENV"
+              break
+            fi
+          done
+
+      - name: Detect GPU compute capability
+        run: |
+          RAW=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader | head -1 | tr -d '.')
+          echo "Detected GPU compute capability: sm_${RAW}"
+          echo "INFERRS_WMMA_ARCH=${RAW}" >> "$GITHUB_ENV"
+          echo "CUDA_COMPUTE_CAP=${RAW}" >> "$GITHUB_ENV"
+
+      - uses: dtolnay/rust-toolchain@stable
+
+      - uses: Swatinem/rust-cache@v2
+        with:
+          key: gpu-bench
+
+      - name: GPU info
+        run: |
+          nvidia-smi
+          nvidia-smi --query-gpu=name,compute_cap,memory.total,driver_version --format=csv
+
+      - name: Build (release, CUDA backend)
+        run: cargo build --release -p inferrs -p inferrs-backend-cuda
+
+      - name: Run inferrs bench
+        if: inputs.model != ''
+        run: |
+          cargo run --release -p inferrs -- bench \
+            --model "${{ inputs.model }}" \
+            --warmup 1 \
+            --runs "${{ inputs.runs }}" \
+            --prompt-len "${{ inputs.prompt_len }}"
+
+      - name: Run nsys profile
+        if: inputs.profile == 'true' && inputs.model != ''
+        run: |
+          # Single-run nsys trace for kernel-level analysis
+          nsys profile \
+            --output /tmp/inferrs-bench-trace \
+            --force-overwrite true \
+            --trace cuda,nvtx \
+            --sample none \
+            cargo run --release -p inferrs -- bench \
+              --model "${{ inputs.model }}" \
+              --warmup 0 \
+              --runs 1 \
+              --prompt-len "${{ inputs.prompt_len }}"
+
+      - name: Upload nsys trace
+        if: inputs.profile == 'true' && inputs.model != ''
+        uses: actions/upload-artifact@v4
+        with:
+          name: nsys-trace
+          path: /tmp/inferrs-bench-trace.*
+          retention-days: 14
+
+      - name: Kernel-level stats (nsys stats)
+        if: inputs.profile == 'true' && inputs.model != ''
+        run: |
+          nsys stats /tmp/inferrs-bench-trace.nsys-rep \
+            --report gpukernsum \
+            --format csv \
+            --output /tmp/inferrs-kernel-stats || true
+          cat /tmp/inferrs-kernel-stats*.csv 2>/dev/null || echo "nsys stats not available"
diff --git a/.github/workflows/gpu-ci.yml b/.github/workflows/gpu-ci.yml
index 6e563a12..ac303d29 100644
--- a/.github/workflows/gpu-ci.yml
+++ b/.github/workflows/gpu-ci.yml
@@ -40,6 +40,29 @@ jobs:
           done
           "${NVCC:-nvcc}" --version

+      - name: Detect GPU compute capability
+        run: |
+          # Query the GPU's compute capability and export it so
+          # candle-kernels/build.rs compiles WMMA kernels to matching SASS.
+          # Without this, DEFAULT_WMMA_ARCH=80 produces sm_80 SASS which
+          # only runs on Ampere — not on Blackwell (sm_120) or other arches.
+          RAW=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader | head -1 | tr -d '.')
+          echo "Detected GPU compute capability: sm_${RAW}"
+          echo "INFERRS_WMMA_ARCH=${RAW}" >> "$GITHUB_ENV"
+          echo "CUDA_COMPUTE_CAP=${RAW}" >> "$GITHUB_ENV"
+          # Blackwell (sm_120) requires CUDA toolkit >= 12.8. Warn early
+          # if the toolkit is too old.
+          NVCC_VER=$(nvcc --version | grep -oP 'release \K[0-9]+\.[0-9]+')
+          echo "nvcc version: ${NVCC_VER}"
+          if [ "${RAW}" -ge 120 ]; then
+            MAJOR=$(echo "$NVCC_VER" | cut -d. -f1)
+            MINOR=$(echo "$NVCC_VER" | cut -d. -f2)
+            if [ "$MAJOR" -lt 12 ] || { [ "$MAJOR" -eq 12 ] && [ "$MINOR" -lt 8 ]; }; then
+              echo "::error::sm_${RAW} (Blackwell) requires CUDA >= 12.8 but found ${NVCC_VER}"
+              exit 1
+            fi
+          fi
+
       - uses: dtolnay/rust-toolchain@stable

       - uses: Swatinem/rust-cache@v2
@@ -49,10 +72,9 @@ jobs:
       - name: Show GPU info
         run: nvidia-smi

-      # Let bindgen_cuda auto-detect the GPU's compute capability via
-      # nvidia-smi. The WMMA .cu files now compile to no-op stubs on
-      # SM < 7.0 (via INFERRS_NO_WMMA), so this works on Pascal (6.1)
-      # as well as Volta+ GPUs.
+      # INFERRS_WMMA_ARCH and CUDA_COMPUTE_CAP are set above from the
+      # runner's actual GPU so both WMMA and non-WMMA kernels compile to
+      # the correct SASS.
       - name: Build (with CUDA backend)
         run: cargo build -p inferrs -p inferrs-backend-cuda


From 840e9e53be029f11469af248d11df6f02dc02f96 Mon Sep 17 00:00:00 2001
From: Rob Taylor
Date: Wed, 15 Apr 2026 23:29:33 +0100
Subject: [PATCH 2/2] refactor(ci): extract shared CUDA setup, fix review findings
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Extract CUDA path detection, CC detection, and Blackwell nvcc version
  check into .github/scripts/cuda-setup.sh — eliminates duplication
  between gpu-ci.yml and gpu-bench.yml, and closes the gap where
  gpu-bench.yml was missing the Blackwell guard.
- Fail fast when no CUDA installation is found (was silent fallback).
- Replace grep -oP (Perl regex, non-portable) with grep -o + awk.
- Make model input required in gpu-bench.yml (was silently skipping
  bench when empty default was used).
- Use pre-built binary for nsys profiling (avoids redundant link).
- Merge redundant nvidia-smi calls into the shared setup script.
- Guard apt-get install with dpkg -s check for self-hosted runner.

Co-developed-by: Claude Code v2.1.104 (claude-opus-4-6)
---
 .github/scripts/cuda-setup.sh   | 56 +++++++++++++++++++++++++++++++++
 .github/workflows/gpu-bench.yml | 45 ++++++--------------------
 .github/workflows/gpu-ci.yml    | 51 +++---------------------------
 3 files changed, 69 insertions(+), 83 deletions(-)
 create mode 100644 .github/scripts/cuda-setup.sh

diff --git a/.github/scripts/cuda-setup.sh b/.github/scripts/cuda-setup.sh
new file mode 100644
index 00000000..c1722f73
--- /dev/null
+++ b/.github/scripts/cuda-setup.sh
@@ -0,0 +1,56 @@
+#!/usr/bin/env bash
+# Shared CUDA environment setup for GPU CI and bench workflows.
+# Finds the CUDA toolkit, detects the GPU's compute capability, and
+# exports env vars that candle-kernels/build.rs needs to compile WMMA
+# kernels to matching SASS.
+#
+# Exports (via $GITHUB_ENV / $GITHUB_PATH):
+#   PATH              — prepends $CUDA_PATH/bin
+#   CUDA_PATH         — root of the CUDA installation
+#   LIBRARY_PATH      — $CUDA_PATH/lib64, ahead of any existing value
+#   LD_LIBRARY_PATH   — same; never leaves a trailing ':' (empty entry = CWD)
+#   INFERRS_WMMA_ARCH — numeric compute cap (e.g. 120)
+#   CUDA_COMPUTE_CAP  — same value, used by bindgen_cuda
+set -euo pipefail
+
+# ---------- Find CUDA installation ----------
+CUDA_FOUND=""
+for d in /usr/local/cuda /opt/cuda /usr; do
+  if [ -x "$d/bin/nvcc" ]; then
+    echo "Found CUDA at $d"
+    echo "$d/bin" >> "$GITHUB_PATH"
+    echo "CUDA_PATH=$d" >> "$GITHUB_ENV"
+    echo "LIBRARY_PATH=$d/lib64${LIBRARY_PATH:+:$LIBRARY_PATH}" >> "$GITHUB_ENV"
+    echo "LD_LIBRARY_PATH=$d/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}" >> "$GITHUB_ENV"
+    CUDA_FOUND="$d"
+    break
+  fi
+done
+if [ -z "$CUDA_FOUND" ]; then
+  echo "::error::No CUDA installation found (checked /usr/local/cuda, /opt/cuda, /usr)"
+  exit 1
+fi
+"$CUDA_FOUND/bin/nvcc" --version
+
+# ---------- Detect GPU compute capability ----------
+RAW=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader | head -1 | tr -d '.')
+echo "Detected GPU compute capability: sm_${RAW}"
+echo "INFERRS_WMMA_ARCH=${RAW}" >> "$GITHUB_ENV"
+echo "CUDA_COMPUTE_CAP=${RAW}" >> "$GITHUB_ENV"
+
+# ---------- GPU info ----------
+nvidia-smi
+nvidia-smi --query-gpu=name,compute_cap,memory.total,driver_version --format=csv
+
+# ---------- Blackwell nvcc version check ----------
+# sm_120+ requires CUDA toolkit >= 12.8.
+NVCC_VER=$("$CUDA_FOUND/bin/nvcc" --version | grep -o 'release [0-9]*\.[0-9]*' | awk '{print $2}')
+echo "nvcc version: ${NVCC_VER}"
+if [ "${RAW}" -ge 120 ]; then
+  MAJOR=$(echo "$NVCC_VER" | cut -d. -f1)
+  MINOR=$(echo "$NVCC_VER" | cut -d. -f2)
+  if [ "$MAJOR" -lt 12 ] || { [ "$MAJOR" -eq 12 ] && [ "$MINOR" -lt 8 ]; }; then
+    echo "::error::sm_${RAW} (Blackwell) requires CUDA >= 12.8 but found ${NVCC_VER}"
+    exit 1
+  fi
+fi
diff --git a/.github/workflows/gpu-bench.yml b/.github/workflows/gpu-bench.yml
index 08846069..68a83475 100644
--- a/.github/workflows/gpu-bench.yml
+++ b/.github/workflows/gpu-bench.yml
@@ -1,13 +1,11 @@
 name: GPU Bench

 on:
-  # Manual trigger with configurable model
   workflow_dispatch:
     inputs:
       model:
         description: 'HuggingFace model ID or local path on runner'
-        required: false
-        default: ''
+        required: true
       prompt_len:
         description: 'Number of synthetic prompt tokens'
         required: false
@@ -34,29 +32,11 @@ jobs:

       - name: Install system dependencies
         run: |
-          sudo apt-get update -q
-          sudo apt-get install -y --no-install-recommends \
-            pkg-config libssl-dev
-
-      - name: Set up CUDA environment
-        run: |
-          for d in /usr/local/cuda /opt/cuda /usr; do
-            if [ -x "$d/bin/nvcc" ]; then
-              echo "Found CUDA at $d"
-              echo "$d/bin" >> "$GITHUB_PATH"
-              echo "CUDA_PATH=$d" >> "$GITHUB_ENV"
-              echo "LIBRARY_PATH=$d/lib64:${LIBRARY_PATH:-}" >> "$GITHUB_ENV"
-              echo "LD_LIBRARY_PATH=$d/lib64:${LD_LIBRARY_PATH:-}" >> "$GITHUB_ENV"
-              break
-            fi
-          done
+          dpkg -s pkg-config libssl-dev &>/dev/null || \
+            { sudo apt-get update -q && sudo apt-get install -y --no-install-recommends pkg-config libssl-dev; }

-      - name: Detect GPU compute capability
-        run: |
-          RAW=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader | head -1 | tr -d '.')
-          echo "Detected GPU compute capability: sm_${RAW}"
-          echo "INFERRS_WMMA_ARCH=${RAW}" >> "$GITHUB_ENV"
-          echo "CUDA_COMPUTE_CAP=${RAW}" >> "$GITHUB_ENV"
+      - name: Set up CUDA + detect GPU
+        run: bash .github/scripts/cuda-setup.sh

       - uses: dtolnay/rust-toolchain@stable

@@ -64,16 +44,10 @@ jobs:
         with:
           key: gpu-bench

-      - name: GPU info
-        run: |
-          nvidia-smi
-          nvidia-smi --query-gpu=name,compute_cap,memory.total,driver_version --format=csv
-
       - name: Build (release, CUDA backend)
         run: cargo build --release -p inferrs -p inferrs-backend-cuda

       - name: Run inferrs bench
-        if: inputs.model != ''
         run: |
           cargo run --release -p inferrs -- bench \
             --model "${{ inputs.model }}" \
@@ -82,22 +56,21 @@ jobs:
             --prompt-len "${{ inputs.prompt_len }}"

       - name: Run nsys profile
-        if: inputs.profile == 'true' && inputs.model != ''
+        if: inputs.profile == 'true'
         run: |
-          # Single-run nsys trace for kernel-level analysis
           nsys profile \
             --output /tmp/inferrs-bench-trace \
             --force-overwrite true \
             --trace cuda,nvtx \
             --sample none \
-            cargo run --release -p inferrs -- bench \
+            ./target/release/inferrs bench \
               --model "${{ inputs.model }}" \
               --warmup 0 \
               --runs 1 \
               --prompt-len "${{ inputs.prompt_len }}"

       - name: Upload nsys trace
-        if: inputs.profile == 'true' && inputs.model != ''
+        if: inputs.profile == 'true'
         uses: actions/upload-artifact@v4
         with:
           name: nsys-trace
@@ -105,7 +78,7 @@ jobs:
           retention-days: 14

       - name: Kernel-level stats (nsys stats)
-        if: inputs.profile == 'true' && inputs.model != ''
+        if: inputs.profile == 'true'
         run: |
           nsys stats /tmp/inferrs-bench-trace.nsys-rep \
             --report gpukernsum \
diff --git a/.github/workflows/gpu-ci.yml b/.github/workflows/gpu-ci.yml
index ac303d29..de896a8b 100644
--- a/.github/workflows/gpu-ci.yml
+++ b/.github/workflows/gpu-ci.yml
@@ -20,48 +20,11 @@ jobs:

       - name: Install system dependencies
         run: |
-          sudo apt-get update -q
-          sudo apt-get install -y --no-install-recommends \
-            pkg-config libssl-dev
+          dpkg -s pkg-config libssl-dev &>/dev/null || \
+            { sudo apt-get update -q && sudo apt-get install -y --no-install-recommends pkg-config libssl-dev; }

-      - name: Set up CUDA environment
-        run: |
-          # Find the CUDA installation (common paths on self-hosted runners)
-          for d in /usr/local/cuda /opt/cuda /usr; do
-            if [ -x "$d/bin/nvcc" ]; then
-              echo "Found CUDA at $d"
-              echo "$d/bin" >> "$GITHUB_PATH"
-              echo "CUDA_PATH=$d" >> "$GITHUB_ENV"
-              echo "LIBRARY_PATH=$d/lib64:${LIBRARY_PATH:-}" >> "$GITHUB_ENV"
-              echo "LD_LIBRARY_PATH=$d/lib64:${LD_LIBRARY_PATH:-}" >> "$GITHUB_ENV"
-              NVCC="$d/bin/nvcc"
-              break
-            fi
-          done
-          "${NVCC:-nvcc}" --version
-
-      - name: Detect GPU compute capability
-        run: |
-          # Query the GPU's compute capability and export it so
-          # candle-kernels/build.rs compiles WMMA kernels to matching SASS.
-          # Without this, DEFAULT_WMMA_ARCH=80 produces sm_80 SASS which
-          # only runs on Ampere — not on Blackwell (sm_120) or other arches.
-          RAW=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader | head -1 | tr -d '.')
-          echo "Detected GPU compute capability: sm_${RAW}"
-          echo "INFERRS_WMMA_ARCH=${RAW}" >> "$GITHUB_ENV"
-          echo "CUDA_COMPUTE_CAP=${RAW}" >> "$GITHUB_ENV"
-          # Blackwell (sm_120) requires CUDA toolkit >= 12.8. Warn early
-          # if the toolkit is too old.
-          NVCC_VER=$(nvcc --version | grep -oP 'release \K[0-9]+\.[0-9]+')
-          echo "nvcc version: ${NVCC_VER}"
-          if [ "${RAW}" -ge 120 ]; then
-            MAJOR=$(echo "$NVCC_VER" | cut -d. -f1)
-            MINOR=$(echo "$NVCC_VER" | cut -d. -f2)
-            if [ "$MAJOR" -lt 12 ] || { [ "$MAJOR" -eq 12 ] && [ "$MINOR" -lt 8 ]; }; then
-              echo "::error::sm_${RAW} (Blackwell) requires CUDA >= 12.8 but found ${NVCC_VER}"
-              exit 1
-            fi
-          fi
+      - name: Set up CUDA + detect GPU
+        run: bash .github/scripts/cuda-setup.sh

       - uses: dtolnay/rust-toolchain@stable

@@ -69,12 +32,6 @@ jobs:
         with:
           key: gpu-ci

-      - name: Show GPU info
-        run: nvidia-smi
-
-      # INFERRS_WMMA_ARCH and CUDA_COMPUTE_CAP are set above from the
-      # runner's actual GPU so both WMMA and non-WMMA kernels compile to
-      # the correct SASS.
       - name: Build (with CUDA backend)
         run: cargo build -p inferrs -p inferrs-backend-cuda
