From dd1d58d2b759fbb3cf49868d642b0c9ab4895170 Mon Sep 17 00:00:00 2001
From: Jordan Nanos <jordan.nanos@gmail.com>
Date: Tue, 14 Apr 2026 14:38:47 -0700
Subject: [PATCH 1/8] Add vLLM dynamic scheduler reconfigure hook

---
 benchmarks/benchmark_lib.sh                | 35 ++++++++
 docs/vllm-dynamic-scheduler-reconfigure.md | 95 ++++++++++++++++++++++
 2 files changed, 130 insertions(+)
 create mode 100644 docs/vllm-dynamic-scheduler-reconfigure.md

diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
index 535313252..d56da14aa 100644
--- a/benchmarks/benchmark_lib.sh
+++ b/benchmarks/benchmark_lib.sh
@@ -9,6 +9,37 @@ export PYTHONDONTWRITEBYTECODE=1
 export PYTHONPYCACHEPREFIX="${PYTHONPYCACHEPREFIX:-/tmp/inferencex-pycache}"
 mkdir -p "$PYTHONPYCACHEPREFIX" 2>/dev/null || true
 
+# --------------------------------
+# vLLM dynamic scheduler reconfiguration
+# --------------------------------
+
+# Reconfigure vLLM scheduler limits on a running endpoint. This requires a vLLM
+# build that exposes POST /pause, POST /reconfigure_scheduler, and POST /resume.
+# The feature is opt-in via VLLM_DYNAMIC_RECONFIGURE=1 and is intended for
+# single-server sweeps where model, parallelism, and cache layout stay fixed.
+reconfigure_vllm_scheduler() {
+    local port="$1"
+    local base_url="${VLLM_DYNAMIC_RECONFIGURE_BASE_URL:-http://0.0.0.0:$port}"
+    local params=()
+
+    [[ -n "${VLLM_MAX_NUM_BATCHED_TOKENS:-}" ]] && \
+        params+=(--data-urlencode "max_num_batched_tokens=$VLLM_MAX_NUM_BATCHED_TOKENS")
+    [[ -n "${VLLM_MAX_NUM_SEQS:-}" ]] && \
+        params+=(--data-urlencode "max_num_seqs=$VLLM_MAX_NUM_SEQS")
+    [[ -n "${VLLM_MAX_NUM_SCHEDULED_TOKENS:-}" ]] && \
+        params+=(--data-urlencode "max_num_scheduled_tokens=$VLLM_MAX_NUM_SCHEDULED_TOKENS")
+
+    if [[ ${#params[@]} -eq 0 ]]; then
+        echo "VLLM_DYNAMIC_RECONFIGURE=1 but no VLLM scheduler parameters were set"
+        return 1
+    fi
+
+    echo "Reconfiguring vLLM scheduler at $base_url"
+    curl -fsS -X POST "$base_url/pause?mode=keep"
+    curl -fsS -X POST -G "$base_url/reconfigure_scheduler" "${params[@]}"
+    curl -fsS -X POST "$base_url/resume"
+}
+
 # --------------------------------
 # GPU monitoring helpers
 # --------------------------------
@@ -326,6 +357,10 @@ run_benchmark_serving() {
         num_prompts="$max_concurrency"
     fi
 
+    if [[ "${VLLM_DYNAMIC_RECONFIGURE:-0}" == "1" && "$backend" == "vllm" ]]; then
+        reconfigure_vllm_scheduler "$port"
+    fi
+
     # Build benchmark command
     local benchmark_cmd=(
         python3 "$workspace_dir/utils/bench_serving/benchmark_serving.py"
diff --git a/docs/vllm-dynamic-scheduler-reconfigure.md b/docs/vllm-dynamic-scheduler-reconfigure.md
new file mode 100644
index 000000000..04062037b
--- /dev/null
+++ b/docs/vllm-dynamic-scheduler-reconfigure.md
@@ -0,0 +1,95 @@
+# vLLM Dynamic Scheduler Reconfiguration
+
+InferenceX can optionally reconfigure selected vLLM scheduler limits between
+benchmark runs without restarting the serving endpoint. This is useful for
+single-server sweeps where the model, parallelism, quantization, max context,
+and KV cache layout stay fixed, but each benchmark case uses different scheduler
+admission limits.
+
+## Requirements
+
+This feature requires a vLLM build that exposes these HTTP endpoints:
+
+- `POST /pause?mode=keep`
+- `POST /reconfigure_scheduler`
+- `POST /resume`
+
+The stock vLLM releases do not provide `/reconfigure_scheduler` unless the
+runtime scheduler reconfiguration patch has been included in the installed vLLM
+package or container image.
+
+## Enabling
+
+Set `VLLM_DYNAMIC_RECONFIGURE=1` before calling `run_benchmark_serving` with
+`--backend vllm`.
+
+Supported environment variables:
+
+```bash
+export VLLM_DYNAMIC_RECONFIGURE=1
+export VLLM_MAX_NUM_BATCHED_TOKENS=32768
+export VLLM_MAX_NUM_SEQS=128
+export VLLM_MAX_NUM_SCHEDULED_TOKENS=32768
+```
+
+`run_benchmark_serving` calls the reconfiguration helper before each benchmark
+run. The helper pauses vLLM, applies the requested scheduler limits, and resumes
+serving.
+
+## Example Sweep
+
+Launch vLLM once with the largest static capacity needed by the sweep, then vary
+scheduler limits between benchmark cases:
+
+```bash
+vllm serve "$MODEL" \
+  --host 0.0.0.0 \
+  --port "$PORT" \
+  --tensor-parallel-size "$TP" \
+  --max-num-seqs 256 \
+  --max-num-batched-tokens 32768 \
+  > "$SERVER_LOG" 2>&1 &
+
+SERVER_PID=$!
+wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
+
+for conc in 1 2 4 8 16 32 64 128; do
+    export VLLM_DYNAMIC_RECONFIGURE=1
+    export VLLM_MAX_NUM_SEQS="$conc"
+    export VLLM_MAX_NUM_BATCHED_TOKENS=32768
+    export VLLM_MAX_NUM_SCHEDULED_TOKENS=32768
+
+    run_benchmark_serving \
+      --model "$MODEL" \
+      --port "$PORT" \
+      --backend vllm \
+      --input-len "$ISL" \
+      --output-len "$OSL" \
+      --random-range-ratio "$RANDOM_RANGE_RATIO" \
+      --num-prompts "$((conc * 10))" \
+      --max-concurrency "$conc" \
+      --result-filename "${RESULT_FILENAME}_conc${conc}" \
+      --result-dir /workspace/ \
+      --server-pid "$SERVER_PID"
+done
+```
+
+## Distribution of the vLLM Patch
+
+Cluster runs must use a vLLM package or image that includes the dynamic scheduler
+API. Practical options are:
+
+1. Build a custom benchmark container from the vLLM branch that contains the API.
+2. Install the patched vLLM wheel in the InferenceX job before starting `vllm serve`.
+3. Mount a patched vLLM checkout and install it editable in the benchmark image.
+
+For reproducible cluster results, prefer a custom container or pinned wheel and
+record the vLLM commit SHA in the benchmark metadata.
+
+## Safety Notes
+
+Do not use this mechanism to change startup-time engine settings such as model,
+quantization, tensor/data/expert parallelism, KV cache dtype, block size,
+`gpu_memory_utilization`, or `max_model_len`. Launch vLLM with the largest static
+capacity required by the sweep and use dynamic reconfiguration only for scheduler
+limits.

From 51f3290d7cfe3516e59c58dbc3db858d7b6b8f28 Mon Sep 17 00:00:00 2001
From: Jordan Nanos <jordan.nanos@gmail.com>
Date: Tue, 14 Apr 2026 14:40:13 -0700
Subject: [PATCH 2/8] Fix reconfigure helper to match vLLM /reconfigure API and
 add A/B test script

- Fix reconfigure_vllm_scheduler() to use POST /reconfigure with a JSON
  body instead of query params to the non-existent /reconfigure_scheduler
- Remove max_num_scheduled_tokens (internal name, not exposed by API)
- Use mode=abort&clear_cache=true on /pause for clean reconfigure cycles
- Add benchmarks/test_reconfigure_sweep.sh for standalone A/B testing on
  a cluster: runs N cold starts (baseline) vs 1 start + N reconfigure
  cycles and prints wall-clock comparison
- Update docs to match actual API surface

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 benchmarks/benchmark_lib.sh                |  35 ++--
 benchmarks/test_reconfigure_sweep.sh       | 205 +++++++++++++++++++++
 docs/vllm-dynamic-scheduler-reconfigure.md | 108 +++++------
 3 files changed, 282 insertions(+), 66 deletions(-)
 create mode 100755 benchmarks/test_reconfigure_sweep.sh

diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
index d56da14aa..5754b87f1 100644
--- a/benchmarks/benchmark_lib.sh
+++ b/benchmarks/benchmark_lib.sh
@@ -14,29 +14,40 @@ mkdir -p "$PYTHONPYCACHEPREFIX" 2>/dev/null || true
 # --------------------------------
 
 # Reconfigure vLLM scheduler limits on a running endpoint. This requires a vLLM
-# build that exposes POST /pause, POST /reconfigure_scheduler, and POST /resume.
+# build that exposes POST /pause, POST /reconfigure, and POST /resume.
 # The feature is opt-in via VLLM_DYNAMIC_RECONFIGURE=1 and is intended for
 # single-server sweeps where model, parallelism, and cache layout stay fixed.
+#
+# Supported env vars (set before calling):
+#   VLLM_MAX_NUM_BATCHED_TOKENS  -- max tokens scheduled per step
+#   VLLM_MAX_NUM_SEQS            -- max concurrent sequences
 reconfigure_vllm_scheduler() {
     local port="$1"
     local base_url="${VLLM_DYNAMIC_RECONFIGURE_BASE_URL:-http://0.0.0.0:$port}"
-    local params=()
 
-    [[ -n "${VLLM_MAX_NUM_BATCHED_TOKENS:-}" ]] && \
-        params+=(--data-urlencode "max_num_batched_tokens=$VLLM_MAX_NUM_BATCHED_TOKENS")
-    [[ -n "${VLLM_MAX_NUM_SEQS:-}" ]] && \
-        params+=(--data-urlencode "max_num_seqs=$VLLM_MAX_NUM_SEQS")
-    [[ -n "${VLLM_MAX_NUM_SCHEDULED_TOKENS:-}" ]] && \
-        params+=(--data-urlencode "max_num_scheduled_tokens=$VLLM_MAX_NUM_SCHEDULED_TOKENS")
+    # Build JSON body from set env vars
+    local json="{"
+    local sep=""
+    if [[ -n "${VLLM_MAX_NUM_BATCHED_TOKENS:-}" ]]; then
+        json+="${sep}\"max_num_batched_tokens\":${VLLM_MAX_NUM_BATCHED_TOKENS}"
+        sep=","
+    fi
+    if [[ -n "${VLLM_MAX_NUM_SEQS:-}" ]]; then
+        json+="${sep}\"max_num_seqs\":${VLLM_MAX_NUM_SEQS}"
+        sep=","
+    fi
+    json+="}"
 
-    if [[ ${#params[@]} -eq 0 ]]; then
+    if [[ "$json" == "{}" ]]; then
         echo "VLLM_DYNAMIC_RECONFIGURE=1 but no VLLM scheduler parameters were set"
         return 1
     fi
 
-    echo "Reconfiguring vLLM scheduler at $base_url"
-    curl -fsS -X POST "$base_url/pause?mode=keep"
-    curl -fsS -X POST -G "$base_url/reconfigure_scheduler" "${params[@]}"
+    echo "Reconfiguring vLLM scheduler at $base_url: $json"
+    curl -fsS -X POST "$base_url/pause?mode=abort&clear_cache=true"
+    curl -fsS -X POST "$base_url/reconfigure" \
+        -H "Content-Type: application/json" \
+        -d "$json"
     curl -fsS -X POST "$base_url/resume"
 }
 
diff --git a/benchmarks/test_reconfigure_sweep.sh b/benchmarks/test_reconfigure_sweep.sh
new file mode 100755
index 000000000..815db2355
--- /dev/null
+++ b/benchmarks/test_reconfigure_sweep.sh
@@ -0,0 +1,205 @@
+#!/usr/bin/env bash
+# A/B test: compare N cold starts (baseline) vs 1 start + N reconfigure cycles.
+#
+# Run this directly on a GPU node inside a vLLM container that includes the
+# /reconfigure endpoint. Mount the inferencex workspace at /workspace.
+#
+# Required env vars:
+#   MODEL   -- HuggingFace model path  (e.g. openai/gpt-oss-120b)
+#   TP      -- tensor-parallel size    (e.g. 8)
+#   CONC    -- benchmark concurrency   (e.g. 32)
+#   ISL     -- input sequence length   (e.g. 1024)
+#   OSL     -- output sequence length  (e.g. 1024)
+#
+# Optional:
+#   PORT              -- server port (default 8888)
+#   VLLM_EXTRA_ARGS   -- extra args passed to vllm serve (e.g. "--kv-cache-dtype fp8")
+#   SKIP_BASELINE     -- set to 1 to skip the baseline phase
+#   SKIP_RECONFIG     -- set to 1 to skip the reconfigure phase
+#
+# Usage:
+#   export MODEL=openai/gpt-oss-120b TP=8 CONC=32 ISL=1024 OSL=1024
+#   bash benchmarks/test_reconfigure_sweep.sh
+set -euo pipefail
+
+source "$(dirname "$0")/benchmark_lib.sh"
+
+check_env_vars MODEL TP CONC ISL OSL
+
+PORT=${PORT:-8888}
+MAX_MODEL_LEN=$(( ISL + OSL + 256 ))
+NUM_PROMPTS=$(( CONC * 10 ))
+
+# Parameter grid to sweep
+MNB_VALUES=(4096 8192 16384)
+MNS_VALUES=(256 512)
+GRID_SIZE=$(( ${#MNB_VALUES[@]} * ${#MNS_VALUES[@]} ))
+
+RESULTS_BASE=/workspace/results_reconfigure_test
+RESULTS_A="${RESULTS_BASE}/baseline"
+RESULTS_B="${RESULTS_BASE}/reconfig"
+mkdir -p "$RESULTS_A" "$RESULTS_B"
+
+SERVER_LOG_DIR="${RESULTS_BASE}/logs"
+mkdir -p "$SERVER_LOG_DIR"
+
+start_server() {
+    local mnb="$1" mns="$2" log="$3"
+
+    vllm serve "$MODEL" \
+        --host 0.0.0.0 --port "$PORT" \
+        --tensor-parallel-size "$TP" \
+        --gpu-memory-utilization 0.9 \
+        --max-model-len "$MAX_MODEL_LEN" \
+        --max-num-batched-tokens "$mnb" \
+        --max-num-seqs "$mns" \
+        --no-enable-prefix-caching \
+        --disable-log-requests \
+        ${VLLM_EXTRA_ARGS:-} \
+        > "$log" 2>&1 &
+    SERVER_PID=$!
+
+    wait_for_server_ready \
+        --port "$PORT" \
+        --server-log "$log" \
+        --server-pid "$SERVER_PID"
+}
+
+kill_server() {
+    if [[ -n "${SERVER_PID:-}" ]]; then
+        kill "$SERVER_PID" 2>/dev/null || true
+        wait "$SERVER_PID" 2>/dev/null || true
+        unset SERVER_PID
+        sleep 2
+    fi
+}
+
+run_bench() {
+    local result_dir="$1" result_name="$2"
+
+    run_benchmark_serving \
+        --model "$MODEL" \
+        --port "$PORT" \
+        --backend vllm \
+        --input-len "$ISL" \
+        --output-len "$OSL" \
+        --random-range-ratio 0.8 \
+        --num-prompts "$NUM_PROMPTS" \
+        --max-concurrency "$CONC" \
+        --result-filename "$result_name" \
+        --result-dir "$result_dir" \
+        --server-pid "$SERVER_PID"
+}
+
+trap kill_server EXIT
+
+pip install -q datasets pandas 2>/dev/null || true
+
+# ──────────────────────────────────────────────
+# Phase A: Baseline — separate server per config
+# ──────────────────────────────────────────────
+if [[ "${SKIP_BASELINE:-0}" != "1" ]]; then
+    echo ""
+    echo "###############################################"
+    echo "# Phase A: Baseline (${GRID_SIZE} cold starts)"
+    echo "###############################################"
+
+    A_START=$(date +%s)
+    A_RUN=0
+
+    for mnb in "${MNB_VALUES[@]}"; do
+      for mns in "${MNS_VALUES[@]}"; do
+        A_RUN=$((A_RUN + 1))
+        echo ""
+        echo "--- A.$A_RUN: max_num_batched_tokens=$mnb max_num_seqs=$mns ---"
+
+        RUN_START=$(date +%s)
+        start_server "$mnb" "$mns" "${SERVER_LOG_DIR}/server_a_${A_RUN}.log"
+        READY_TIME=$(date +%s)
+        echo "  Startup: $((READY_TIME - RUN_START))s"
+
+        run_bench "$RESULTS_A" "baseline_mnb${mnb}_mns${mns}"
+
+        kill_server
+        RUN_END=$(date +%s)
+        echo "  Total: $((RUN_END - RUN_START))s"
+      done
+    done
+
+    A_END=$(date +%s)
+    A_TOTAL=$((A_END - A_START))
+    echo ""
+    echo "Phase A total: ${A_TOTAL}s"
+else
+    A_TOTAL="(skipped)"
+fi
+
+# ──────────────────────────────────────────────
+# Phase B: Reconfigure — single server, N cycles
+# ──────────────────────────────────────────────
+if [[ "${SKIP_RECONFIG:-0}" != "1" ]]; then
+    echo ""
+    echo "###############################################"
+    echo "# Phase B: Reconfigure (1 cold start)"
+    echo "###############################################"
+
+    B_START=$(date +%s)
+
+    # Start with the largest values so CUDA graphs and KV cache cover all configs
+    INIT_MNB=${MNB_VALUES[-1]}
+    INIT_MNS=${MNS_VALUES[-1]}
+
+    echo ""
+    echo "--- Starting server (mnb=$INIT_MNB mns=$INIT_MNS) ---"
+    STARTUP_START=$(date +%s)
+    start_server "$INIT_MNB" "$INIT_MNS" "${SERVER_LOG_DIR}/server_b.log"
+    STARTUP_END=$(date +%s)
+    echo "  Startup: $((STARTUP_END - STARTUP_START))s"
+
+    B_RUN=0
+    for mnb in "${MNB_VALUES[@]}"; do
+      for mns in "${MNS_VALUES[@]}"; do
+        B_RUN=$((B_RUN + 1))
+        echo ""
+        echo "--- B.$B_RUN: max_num_batched_tokens=$mnb max_num_seqs=$mns ---"
+
+        RECONF_START=$(date +%s)
+
+        export VLLM_DYNAMIC_RECONFIGURE=1
+        export VLLM_MAX_NUM_BATCHED_TOKENS="$mnb"
+        export VLLM_MAX_NUM_SEQS="$mns"
+        reconfigure_vllm_scheduler "$PORT"
+
+        RECONF_END=$(date +%s)
+        echo "  Reconfigure: $((RECONF_END - RECONF_START))s"
+
+        run_bench "$RESULTS_B" "reconfig_mnb${mnb}_mns${mns}"
+
+        RUN_END=$(date +%s)
+        echo "  Total: $((RUN_END - RECONF_START))s"
+      done
+    done
+
+    kill_server
+
+    B_END=$(date +%s)
+    B_TOTAL=$((B_END - B_START))
+    B_STARTUP=$((STARTUP_END - STARTUP_START))
+    echo ""
+    echo "Phase B total: ${B_TOTAL}s (startup: ${B_STARTUP}s)"
+else
+    B_TOTAL="(skipped)"
+fi
+
+# ──────────────────────────────────────────────
+# Summary
+# ──────────────────────────────────────────────
+echo ""
+echo "=============================================="
+echo " Comparison"
+echo "=============================================="
+echo " Phase A (baseline, ${GRID_SIZE} cold starts):  ${A_TOTAL}s"
+echo " Phase B (reconfigure, 1 cold start):   ${B_TOTAL}s"
+echo ""
+echo " Results saved to: ${RESULTS_BASE}/"
+echo "=============================================="
diff --git a/docs/vllm-dynamic-scheduler-reconfigure.md b/docs/vllm-dynamic-scheduler-reconfigure.md
index 04062037b..7db5c7b07 100644
--- a/docs/vllm-dynamic-scheduler-reconfigure.md
+++ b/docs/vllm-dynamic-scheduler-reconfigure.md
@@ -10,81 +10,81 @@ admission limits.
 
 This feature requires a vLLM build that exposes these HTTP endpoints:
 
-- `POST /pause?mode=keep`
-- `POST /reconfigure_scheduler`
+- `POST /pause?mode=abort&clear_cache=true`
+- `POST /reconfigure` (JSON body)
 - `POST /resume`
 
-The stock vLLM releases do not provide `/reconfigure_scheduler` unless the
-runtime scheduler reconfiguration patch has been included in the installed vLLM
-package or container image.
+The stock vLLM releases do not provide `/reconfigure`. You need either:
 
-## Enabling
+1. A Docker image built from the vLLM branch containing the reconfigure API
+   (see [Building the patched image](#building-the-patched-image) below).
+2. A runtime patch applied at container start from a mounted vLLM checkout.
+
+## Reconfigurable Parameters
+
+| Parameter | Type | Description |
+|-----------|------|-------------|
+| `max_num_batched_tokens` | int > 0 | Max tokens scheduled per step |
+| `max_num_seqs` | int > 0 | Max concurrent sequences |
+| `enable_chunked_prefill` | bool | Toggle chunked prefill |
+| `long_prefill_token_threshold` | int >= 0 | Cap prefill chunk size (0 = no cap) |
 
-Set `VLLM_DYNAMIC_RECONFIGURE=1` before calling `run_benchmark_serving` with
-`--backend vllm`.
+Everything else (TP, EP, GPU memory, KV cache dtype, block size, CUDA graphs,
+compilation config, etc.) is baked in at startup and cannot be changed.
 
-Supported environment variables:
+## Enabling
+
+Set `VLLM_DYNAMIC_RECONFIGURE=1` and the desired parameter env vars before
+calling `run_benchmark_serving` with `--backend vllm`:
 
 ```bash
 export VLLM_DYNAMIC_RECONFIGURE=1
 export VLLM_MAX_NUM_BATCHED_TOKENS=32768
 export VLLM_MAX_NUM_SEQS=128
-export VLLM_MAX_NUM_SCHEDULED_TOKENS=32768
 ```
 
-`run_benchmark_serving` calls the reconfiguration helper before each benchmark
-run. The helper pauses vLLM, applies the requested scheduler limits, and resumes
-serving.
+`run_benchmark_serving` calls `reconfigure_vllm_scheduler` before each benchmark
+run. The helper pauses vLLM, sends a JSON body to `/reconfigure`, and resumes.
+
+## A/B Test Script
 
-## Example Sweep
+`benchmarks/test_reconfigure_sweep.sh` runs back-to-back comparisons:
 
-Launch vLLM once with the largest static capacity needed by the sweep, then vary
-scheduler limits between benchmark cases:
+- **Phase A (baseline):** N cold starts, one per parameter combo
+- **Phase B (reconfigure):** 1 cold start, N reconfigure cycles
 
 ```bash
-vllm serve "$MODEL" \
-  --host 0.0.0.0 \
-  --port "$PORT" \
-  --tensor-parallel-size "$TP" \
-  --max-num-seqs 256 \
-  --max-num-batched-tokens 32768 \
-  > "$SERVER_LOG" 2>&1 &
-
-SERVER_PID=$!
-wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
-
-for conc in 1 2 4 8 16 32 64 128; do
-    export VLLM_DYNAMIC_RECONFIGURE=1
-    export VLLM_MAX_NUM_SEQS="$conc"
-    export VLLM_MAX_NUM_BATCHED_TOKENS=32768
-    export VLLM_MAX_NUM_SCHEDULED_TOKENS=32768
-
-    run_benchmark_serving \
-      --model "$MODEL" \
-      --port "$PORT" \
-      --backend vllm \
-      --input-len "$ISL" \
-      --output-len "$OSL" \
-      --random-range-ratio "$RANDOM_RANGE_RATIO" \
-      --num-prompts "$((conc * 10))" \
-      --max-concurrency "$conc" \
-      --result-filename "${RESULT_FILENAME}_conc${conc}" \
-      --result-dir /workspace/ \
-      --server-pid "$SERVER_PID"
-done
+export MODEL=openai/gpt-oss-120b TP=8 CONC=32 ISL=1024 OSL=1024
+bash benchmarks/test_reconfigure_sweep.sh
 ```
 
-## Distribution of the vLLM Patch
+## Building the Patched Image
 
-Cluster runs must use a vLLM package or image that includes the dynamic scheduler
-API. Practical options are:
+The changes are pure Python -- no C++/CUDA recompilation needed. Overlay them
+onto the stock vLLM image:
 
-1. Build a custom benchmark container from the vLLM branch that contains the API.
-2. Install the patched vLLM wheel in the InferenceX job before starting `vllm serve`.
-3. Mount a patched vLLM checkout and install it editable in the benchmark image.
+```bash
+# From the vllm repo root (on the branch with the reconfigure patch)
+docker build -f docker/Dockerfile.reconfigure-overlay \
+  -t ghcr.io/semianalysisai/vllm-reconfigure:test1 .
+
+docker push ghcr.io/semianalysisai/vllm-reconfigure:test1
+```
 
-For reproducible cluster results, prefer a custom container or pinned wheel and
-record the vLLM commit SHA in the benchmark metadata.
+Or patch at runtime by mounting the vLLM checkout and running the overlay script
+at the top of the benchmark:
+
+```bash
+docker run --gpus all --rm -it --network host --shm-size 64g \
+  -v /path/to/vllm:/workspace/vllm-patch:ro \
+  -v /path/to/inferencex:/workspace \
+  vllm/vllm-openai:v0.15.1 \
+  bash -c '
+    bash /workspace/vllm-patch/docker/apply-reconfigure-overlay.sh
+    export MODEL=openai/gpt-oss-120b TP=8 CONC=32 ISL=1024 OSL=1024
+    bash /workspace/benchmarks/test_reconfigure_sweep.sh
+  '
+```
 
 ## Safety Notes
 

From a1952975b54bc9a7121be3bc62d5f3eeef0cdae8 Mon Sep 17 00:00:00 2001
From: Jordan Nanos <jordan.nanos@gmail.com>
Date: Tue, 14 Apr 2026 14:41:15 -0700
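Subject: [PATCH 3/8] Document patched vLLM distribution options

Add docs/vllm-patched-distribution.md describing how to ship a vLLM build
that includes the reconfigure API, and add the install_patched_vllm helper
to benchmarks/benchmark_lib.sh (wheel, git, and editable install modes).

---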
 benchmarks/benchmark_lib.sh       | 78 +++++++++++++++++++++++++
 docs/vllm-patched-distribution.md | 94 +++++++++++++++++++++++++++++++
 2 files changed, 172 insertions(+)
 create mode 100644 docs/vllm-patched-distribution.md

diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
index 5754b87f1..e138a93eb 100644
--- a/benchmarks/benchmark_lib.sh
+++ b/benchmarks/benchmark_lib.sh
@@ -851,3 +851,81 @@ run_eval() {
     fi
     return $eval_rc
 }
+
+# --------------------------------
+# Patched vLLM distribution helpers
+# --------------------------------
+
+# Install a patched vLLM build before launching `vllm serve`. This is optional
+# and only needed when the active container/image does not already include the
+# dynamic scheduler reconfiguration API.
+#
+# Supported env vars:
+#   VLLM_PATCHED_WHEEL        -- local or remote wheel path/URL
+#   VLLM_PATCHED_REPO         -- git repository URL for patched vLLM
+#   VLLM_PATCHED_REF          -- branch, tag, or commit for VLLM_PATCHED_REPO
+#   VLLM_PATCHED_CHECKOUT     -- existing mounted checkout to install editable
+#   VLLM_PATCHED_INSTALL_MODE -- wheel, git, editable, or auto (default)
+install_patched_vllm() {
+    local mode="${VLLM_PATCHED_INSTALL_MODE:-auto}"
+
+    if [[ "$mode" == "auto" ]]; then
+        if [[ -n "${VLLM_PATCHED_WHEEL:-}" ]]; then
+            mode="wheel"
+        elif [[ -n "${VLLM_PATCHED_CHECKOUT:-}" ]]; then
+            mode="editable"
+        elif [[ -n "${VLLM_PATCHED_REPO:-}" ]]; then
+            mode="git"
+        else
+            echo "No patched vLLM install source configured; using existing vLLM"
+            return 0
+        fi
+    fi
+
+    case "$mode" in
+        wheel)
+            if [[ -z "${VLLM_PATCHED_WHEEL:-}" ]]; then
+                echo "VLLM_PATCHED_INSTALL_MODE=wheel requires VLLM_PATCHED_WHEEL"
+                return 1
+            fi
+            echo "Installing patched vLLM wheel: $VLLM_PATCHED_WHEEL"
+            python3 -m pip install --no-cache-dir --force-reinstall "$VLLM_PATCHED_WHEEL"
+            ;;
+        git)
+            if [[ -z "${VLLM_PATCHED_REPO:-}" || -z "${VLLM_PATCHED_REF:-}" ]]; then
+                echo "VLLM_PATCHED_INSTALL_MODE=git requires VLLM_PATCHED_REPO and VLLM_PATCHED_REF"
+                return 1
+            fi
+            local checkout_dir="${VLLM_PATCHED_GIT_DIR:-/tmp/patched-vllm}"
+            rm -rf "$checkout_dir"
+            git clone --depth 1 --branch "$VLLM_PATCHED_REF" "$VLLM_PATCHED_REPO" "$checkout_dir" || {
+                git clone "$VLLM_PATCHED_REPO" "$checkout_dir"
+                git -C "$checkout_dir" checkout "$VLLM_PATCHED_REF"
+            }
+            echo "Installing patched vLLM from $VLLM_PATCHED_REPO@$VLLM_PATCHED_REF"
+            VLLM_USE_PRECOMPILED=${VLLM_USE_PRECOMPILED:-1} \
+                python3 -m pip install --no-cache-dir -e "$checkout_dir"
+            ;;
+        editable)
+            if [[ -z "${VLLM_PATCHED_CHECKOUT:-}" ]]; then
+                echo "VLLM_PATCHED_INSTALL_MODE=editable requires VLLM_PATCHED_CHECKOUT"
+                return 1
+            fi
+            echo "Installing patched vLLM editable checkout: $VLLM_PATCHED_CHECKOUT"
+            VLLM_USE_PRECOMPILED=${VLLM_USE_PRECOMPILED:-1} \
+                python3 -m pip install --no-cache-dir -e "$VLLM_PATCHED_CHECKOUT"
+            ;;
+        *)
+            echo "Unknown VLLM_PATCHED_INSTALL_MODE: $mode"
+            return 1
+            ;;
+    esac
+
+    python3 - <<'PY'
+import importlib.metadata
+try:
+    print("Installed vLLM version:", importlib.metadata.version("vllm"))
+except importlib.metadata.PackageNotFoundError:
+    print("Installed vLLM version: unknown")
+PY
+}
diff --git a/docs/vllm-patched-distribution.md b/docs/vllm-patched-distribution.md
new file mode 100644
index 000000000..ab94f4245
--- /dev/null
+++ b/docs/vllm-patched-distribution.md
@@ -0,0 +1,94 @@
+# Distributing Patched vLLM for Dynamic Reconfiguration
+
+Dynamic scheduler reconfiguration requires a vLLM build that includes the
+runtime reconfiguration API used by InferenceX:
+
+- `POST /pause`
+- `POST /reconfigure`
+- `POST /resume`
+
+A stock vLLM image or wheel that does not include this API can still run normal
+InferenceX benchmarks. The patched build is only needed when
+`VLLM_DYNAMIC_RECONFIGURE=1` is enabled.
+
+## Option 1: Custom Benchmark Container
+
+Build a container that starts from the normal InferenceX/vLLM benchmark image and
+installs the patched vLLM branch or wheel at image build time. This is the most
+reproducible option for cluster sweeps.
+
+Example Dockerfile pattern:
+
+```Dockerfile
+FROM vllm/vllm-openai:<base-tag>
+
+ARG VLLM_REPO=https://github.com/<org>/vllm.git
+ARG VLLM_REF=<patched-commit-or-branch>
+
+RUN git clone "$VLLM_REPO" /opt/patched-vllm \
+ && cd /opt/patched-vllm \
+ && git checkout "$VLLM_REF" \
+ && VLLM_USE_PRECOMPILED=1 python3 -m pip install --no-cache-dir -e .
+
+LABEL org.opencontainers.image.source-vllm="$VLLM_REPO"
+LABEL org.opencontainers.image.revision-vllm="$VLLM_REF"
+```
+
+Then update the InferenceX config entry to use that image for the vLLM run.
+
+## Option 2: Install a Pinned Wheel Before `vllm serve`
+
+Build a patched wheel once, publish it to an artifact store, and install it in
+the benchmark job before launching `vllm serve`.
+
+```bash
+export VLLM_PATCHED_INSTALL_MODE=wheel
+export VLLM_PATCHED_WHEEL=/workspace/wheels/vllm-<version>-patched.whl
+install_patched_vllm
+```
+
+Remote wheel URLs also work if the runner has access:
+
+```bash
+export VLLM_PATCHED_WHEEL=https://example.internal/wheels/vllm-patched.whl
+install_patched_vllm
+```
+
+## Option 3: Mount a Patched Checkout and Install Editable
+
+This is useful for fast experiments, but less reproducible than an image or
+wheel. Mount the patched vLLM checkout into the job and run:
+
+```bash
+export VLLM_PATCHED_INSTALL_MODE=editable
+export VLLM_PATCHED_CHECKOUT=/workspace/vllm-patched
+install_patched_vllm
+```
+
+## Option 4: Clone and Install a Pinned Git Ref
+
+This is convenient when the cluster has network access to the patched branch:
+
+```bash
+export VLLM_PATCHED_INSTALL_MODE=git
+export VLLM_PATCHED_REPO=https://github.com/<org>/vllm.git
+export VLLM_PATCHED_REF=<patched-commit-or-branch>
+install_patched_vllm
+```
+
+For reproducible benchmark results, prefer a commit SHA over a mutable branch.
+
+## Recommended Cluster Flow
+
+1. Start from a custom image or call `install_patched_vllm` before `vllm serve`.
+2. Launch vLLM with the largest static capacity needed by the sweep.
+3. Enable scheduler reconfiguration for the sweep:
+
+```bash
+export VLLM_DYNAMIC_RECONFIGURE=1
+export VLLM_MAX_NUM_BATCHED_TOKENS=32768
+export VLLM_MAX_NUM_SEQS="$CONC"
+```
+
+4. Run `run_benchmark_serving` as usual.
+5. Record the patched vLLM commit SHA or wheel URL in the run notes/results.

From d35f6c331431b618d4cf7b69157d09d8b4949736 Mon Sep 17 00:00:00 2001
From: Jordan Nanos <jordan.nanos@gmail.com>
Date: Tue, 14 Apr 2026 14:44:18 -0700
Subject: [PATCH 4/8] Fix pause mode: use mode=keep (required by reconfigure
 API)

The vLLM /reconfigure endpoint requires PAUSED_ALL state, which maps to
pause mode="keep". Using mode="abort" would leave the scheduler in
PAUSED_NEW state, causing reconfigure to reject the request.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 benchmarks/benchmark_lib.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
index e138a93eb..ba9c39ae9 100644
--- a/benchmarks/benchmark_lib.sh
+++ b/benchmarks/benchmark_lib.sh
@@ -44,7 +44,7 @@ reconfigure_vllm_scheduler() {
     fi
 
     echo "Reconfiguring vLLM scheduler at $base_url: $json"
-    curl -fsS -X POST "$base_url/pause?mode=abort&clear_cache=true"
+    curl -fsS -X POST "$base_url/pause?mode=keep&clear_cache=true"
     curl -fsS -X POST "$base_url/reconfigure" \
         -H "Content-Type: application/json" \
         -d "$json"

From f9aa78c4fd4fb7c9eaa138eee7d260d299532474 Mon Sep 17 00:00:00 2001
From: Jordan Nanos <jordan.nanos@gmail.com>
Date: Tue, 14 Apr 2026 14:45:58 -0700
Subject: [PATCH 5/8] Fix four bugs in reconfigure integration

1. Double reconfigure in test_reconfigure_sweep.sh: Phase B called
   reconfigure_vllm_scheduler manually then run_benchmark_serving
   called it again via the VLLM_DYNAMIC_RECONFIGURE hook. Remove the
   manual call and let the hook handle it.

2. Doc listed mode=abort but vLLM /reconfigure requires PAUSED_ALL
   which maps to mode=keep. Fix the Requirements section.

3. No error recovery in reconfigure_vllm_scheduler: if /reconfigure
   failed, curl exited non-zero, set -e aborted the function before
   /resume ran, and the server stayed paused. Now capture the exit
   code of /reconfigure, always call /resume, then propagate the error.

4. --force-reinstall in wheel mode reinstalls all dependencies. Use
   --no-deps --force-reinstall to only replace the vllm package.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 benchmarks/benchmark_lib.sh                | 13 +++++++++++--
 benchmarks/test_reconfigure_sweep.sh       |  7 +++----
 docs/vllm-dynamic-scheduler-reconfigure.md |  2 +-
 3 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
index ba9c39ae9..608261a06 100644
--- a/benchmarks/benchmark_lib.sh
+++ b/benchmarks/benchmark_lib.sh
@@ -45,10 +45,19 @@ reconfigure_vllm_scheduler() {
 
     echo "Reconfiguring vLLM scheduler at $base_url: $json"
     curl -fsS -X POST "$base_url/pause?mode=keep&clear_cache=true"
+
+    local rc=0
     curl -fsS -X POST "$base_url/reconfigure" \
         -H "Content-Type: application/json" \
-        -d "$json"
+        -d "$json" || rc=$?
+
+    # Always resume so the server is never left paused on failure.
     curl -fsS -X POST "$base_url/resume"
+
+    if [[ "$rc" -ne 0 ]]; then
+        echo "ERROR: /reconfigure failed (curl exit code $rc)" >&2
+        return "$rc"
+    fi
 }
 
 # --------------------------------
@@ -889,7 +898,7 @@ install_patched_vllm() {
                 return 1
             fi
             echo "Installing patched vLLM wheel: $VLLM_PATCHED_WHEEL"
-            python3 -m pip install --no-cache-dir --force-reinstall "$VLLM_PATCHED_WHEEL"
+            python3 -m pip install --no-cache-dir --no-deps --force-reinstall "$VLLM_PATCHED_WHEEL"
             ;;
         git)
             if [[ -z "${VLLM_PATCHED_REPO:-}" || -z "${VLLM_PATCHED_REF:-}" ]]; then
diff --git a/benchmarks/test_reconfigure_sweep.sh b/benchmarks/test_reconfigure_sweep.sh
index 815db2355..3a7d5f08d 100755
--- a/benchmarks/test_reconfigure_sweep.sh
+++ b/benchmarks/test_reconfigure_sweep.sh
@@ -165,13 +165,12 @@ if [[ "${SKIP_RECONFIG:-0}" != "1" ]]; then
 
         RECONF_START=$(date +%s)
 
+        # Set env vars — run_benchmark_serving picks these up via
+        # the VLLM_DYNAMIC_RECONFIGURE hook and calls
+        # reconfigure_vllm_scheduler automatically.
         export VLLM_DYNAMIC_RECONFIGURE=1
         export VLLM_MAX_NUM_BATCHED_TOKENS="$mnb"
         export VLLM_MAX_NUM_SEQS="$mns"
-        reconfigure_vllm_scheduler "$PORT"
-
-        RECONF_END=$(date +%s)
-        echo "  Reconfigure: $((RECONF_END - RECONF_START))s"
 
         run_bench "$RESULTS_B" "reconfig_mnb${mnb}_mns${mns}"
 
diff --git a/docs/vllm-dynamic-scheduler-reconfigure.md b/docs/vllm-dynamic-scheduler-reconfigure.md
index 7db5c7b07..75dbd90bd 100644
--- a/docs/vllm-dynamic-scheduler-reconfigure.md
+++ b/docs/vllm-dynamic-scheduler-reconfigure.md
@@ -10,7 +10,7 @@ admission limits.
 
 This feature requires a vLLM build that exposes these HTTP endpoints:
 
-- `POST /pause?mode=abort&clear_cache=true`
+- `POST /pause?mode=keep&clear_cache=true`
 - `POST /reconfigure` (JSON body)
 - `POST /resume`
 

From 7c5859db83b02a687ab98eb86dc4b1dfa65e14df Mon Sep 17 00:00:00 2001
From: Jordan Nanos <jordan.nanos@gmail.com>
Date: Tue, 14 Apr 2026 15:08:15 -0700
Subject: [PATCH 6/8] Update docs with pre-built Docker Hub image and run
 instructions

Image: semianalysiswork/vllm-reconfigure:latest
Based on vllm/vllm-openai:v0.18.0 with reconfigure API overlay.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 docs/vllm-dynamic-scheduler-reconfigure.md | 52 ++++++++++++++--------
 1 file changed, 33 insertions(+), 19 deletions(-)

diff --git a/docs/vllm-dynamic-scheduler-reconfigure.md b/docs/vllm-dynamic-scheduler-reconfigure.md
index 75dbd90bd..8d80a3d49 100644
--- a/docs/vllm-dynamic-scheduler-reconfigure.md
+++ b/docs/vllm-dynamic-scheduler-reconfigure.md
@@ -58,32 +58,46 @@ export MODEL=openai/gpt-oss-120b TP=8 CONC=32 ISL=1024 OSL=1024
 bash benchmarks/test_reconfigure_sweep.sh
 ```
 
-## Building the Patched Image
+## Pre-built Image
 
-The changes are pure Python -- no C++/CUDA recompilation needed. Overlay them
-onto the stock vLLM image:
+A pre-built image is available on Docker Hub:
 
-```bash
-# From the vllm repo root (on the branch with the reconfigure patch)
-docker build -f docker/Dockerfile.reconfigure-overlay \
-  -t ghcr.io/semianalysisai/vllm-reconfigure:test1 .
+    semianalysiswork/vllm-reconfigure:latest
+
+This overlays the reconfigure API onto `vllm/vllm-openai:v0.18.0`.
+Source: [JordanNanos/vllm `feature/reconfigure-scheduler`](https://github.com/JordanNanos/vllm/tree/feature/reconfigure-scheduler),
+Dockerfile: `docker/Dockerfile.single-node-nvidia`.
+
+## Running a Single-Node Test
 
-docker push ghcr.io/semianalysisai/vllm-reconfigure:test1
+```bash
+docker run --rm --init --network host \
+  --runtime nvidia --gpus all --ipc host --privileged \
+  --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 \
+  -v "${HF_HUB_CACHE}:/root/.cache/huggingface" \
+  -v "$(pwd):/workspace" -w /workspace \
+  -e HF_TOKEN -e PORT=8888 \
+  -e MODEL=openai/gpt-oss-120b \
+  -e TP=8 -e CONC=32 \
+  -e ISL=1024 -e OSL=1024 \
+  semianalysiswork/vllm-reconfigure:latest \
+  bash benchmarks/test_reconfigure_sweep.sh
 ```
 
-Or patch at runtime by mounting the vLLM checkout and running the overlay script
-at the top of the benchmark:
+Use `SKIP_BASELINE=1` or `SKIP_RECONFIG=1` to run only one phase.
+Pass extra vLLM flags via `VLLM_EXTRA_ARGS` (e.g. `--kv-cache-dtype fp8`).
+
+## Building the Image From Source
+
+To rebuild from the vLLM fork:
 
 ```bash
-docker run --gpus all --rm -it --network host --shm-size 64g \
-  -v /path/to/vllm:/workspace/vllm-patch:ro \
-  -v /path/to/inferencex:/workspace \
-  vllm/vllm-openai:v0.15.1 \
-  bash -c '
-    bash /workspace/vllm-patch/docker/apply-reconfigure-overlay.sh
-    export MODEL=openai/gpt-oss-120b TP=8 CONC=32 ISL=1024 OSL=1024
-    bash /workspace/benchmarks/test_reconfigure_sweep.sh
-  '
+git clone https://github.com/JordanNanos/vllm.git -b feature/reconfigure-scheduler
+cd vllm
+docker build --platform linux/amd64 \
+  -f docker/Dockerfile.single-node-nvidia \
+  --build-arg BASE_IMAGE=vllm/vllm-openai:v0.18.0 \
+  -t semianalysiswork/vllm-reconfigure:latest .
 ```
 
 ## Safety Notes

From c0859c175de6a5be6f25e999f1a1effe4acdf78e Mon Sep 17 00:00:00 2001
From: Jordan Nanos <jordan.nanos@gmail.com>
Date: Tue, 14 Apr 2026 17:54:10 -0700
Subject: [PATCH 7/8] Add workflow for scheduler reconfigure A/B test

Standalone workflow_dispatch workflow that runs
benchmarks/test_reconfigure_sweep.sh on any GPU runner using the
semianalysiswork/vllm-reconfigure:latest image.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .github/workflows/test-reconfigure.yml | 130 +++++++++++++++++++++++++
 1 file changed, 130 insertions(+)
 create mode 100644 .github/workflows/test-reconfigure.yml

diff --git a/.github/workflows/test-reconfigure.yml b/.github/workflows/test-reconfigure.yml
new file mode 100644
index 000000000..2d63b0ff8
--- /dev/null
+++ b/.github/workflows/test-reconfigure.yml
@@ -0,0 +1,130 @@
+name: Test - Scheduler Reconfigure A/B
+run-name: "Reconfigure A/B | ${{ inputs.model }} tp=${{ inputs.tp }} conc=${{ inputs.conc }} | ${{ inputs.runner }}"
+
+on:
+  workflow_dispatch:
+    inputs:
+      runner:
+        description: "Self-hosted runner label (e.g. b200-dgxc_1)"
+        required: true
+        type: string
+      model:
+        description: "HuggingFace model path"
+        required: true
+        type: string
+        default: "openai/gpt-oss-120b"
+      tp:
+        description: "Tensor parallel size"
+        required: true
+        type: string
+        default: "8"
+      conc:
+        description: "Benchmark concurrency"
+        required: true
+        type: string
+        default: "32"
+      isl:
+        description: "Input sequence length"
+        required: true
+        type: string
+        default: "1024"
+      osl:
+        description: "Output sequence length"
+        required: true
+        type: string
+        default: "1024"
+      extra-args:
+        description: "Extra vllm serve args (e.g. --kv-cache-dtype fp8)"
+        required: false
+        type: string
+      skip-baseline:
+        description: "Skip Phase A (baseline cold starts)"
+        required: false
+        type: boolean
+        default: false
+      skip-reconfig:
+        description: "Skip Phase B (reconfigure)"
+        required: false
+        type: boolean
+        default: false
+
+env:
+  IMAGE: semianalysiswork/vllm-reconfigure:latest
+
+jobs:
+  reconfigure-ab-test:
+    runs-on: ${{ inputs.runner }}
+    timeout-minutes: 300
+    name: "reconfigure-ab | ${{ inputs.model }} | ${{ inputs.runner }}"
+    steps:
+      - name: Resource cleanup (pre-run)
+        run: |
+          if command -v docker >/dev/null 2>&1 && docker info >/dev/null 2>&1; then
+            docker ps -aq | xargs -r docker rm -f || true
+            docker network prune -f || true
+          fi
+
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        with:
+          ref: ${{ github.ref }}
+          clean: false
+
+      - name: Pull image
+        run: docker pull "$IMAGE"
+
+      - name: Run A/B test
+        # Dispatch inputs are passed through the step environment rather than
+        # interpolated into the script text, so quotes or shell metacharacters
+        # in a value (e.g. extra-args) cannot break or inject into the command.
+        env:
+          MODEL: "${{ inputs.model }}"
+          TP: "${{ inputs.tp }}"
+          CONC: "${{ inputs.conc }}"
+          ISL: "${{ inputs.isl }}"
+          OSL: "${{ inputs.osl }}"
+          VLLM_EXTRA_ARGS: "${{ inputs.extra-args }}"
+          SKIP_BASELINE: "${{ inputs.skip-baseline && '1' || '0' }}"
+          SKIP_RECONFIG: "${{ inputs.skip-reconfig && '1' || '0' }}"
+        run: |
+          set -x
+          HF_HUB_CACHE_MOUNT="/raid/hf_hub_cache/"
+
+          # A bare `-e NAME` forwards the variable from this step's
+          # environment into the container.
+          docker run --rm --init --network host \
+            --runtime nvidia --gpus all --ipc host --privileged \
+            --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 \
+            -v "${HF_HUB_CACHE_MOUNT}:/root/.cache/huggingface" \
+            -v "$GITHUB_WORKSPACE:/workspace" -w /workspace \
+            -e HF_TOKEN \
+            -e MODEL \
+            -e TP \
+            -e CONC \
+            -e ISL \
+            -e OSL \
+            -e PORT=8888 \
+            -e RANDOM_RANGE_RATIO=0.8 \
+            -e VLLM_EXTRA_ARGS \
+            -e SKIP_BASELINE \
+            -e SKIP_RECONFIG \
+            -e RESULT_FILENAME="reconfigure_ab_test" \
+            -e NCCL_GRAPH_REGISTER=0 \
+            -e PYTHONPYCACHEPREFIX=/tmp/pycache/ \
+            "$IMAGE" \
+            bash benchmarks/test_reconfigure_sweep.sh
+
+      - name: Upload results
+        if: always()
+        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
+        with:
+          name: reconfigure-ab-results-${{ inputs.runner }}
+          path: |
+            results_reconfigure_test/
+          retention-days: 30
+
+      - name: Resource cleanup (post-run)
+        if: always()
+        run: |
+          if command -v docker >/dev/null 2>&1 && docker info >/dev/null 2>&1; then
+            docker ps -aq | xargs -r docker rm -f || true
+          fi

From fd4704636a4c0ba5763e32df6d4254895520e477 Mon Sep 17 00:00:00 2001
From: Jordan Nanos <jordan.nanos@gmail.com>
Date: Tue, 14 Apr 2026 17:58:09 -0700
Subject: [PATCH 8/8] Potential fix for pull request finding 'CodeQL / Workflow
 does not contain permissions'

Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com>
---
 .github/workflows/test-reconfigure.yml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.github/workflows/test-reconfigure.yml b/.github/workflows/test-reconfigure.yml
index 2d63b0ff8..e701c8408 100644
--- a/.github/workflows/test-reconfigure.yml
+++ b/.github/workflows/test-reconfigure.yml
@@ -48,6 +48,9 @@ on:
         type: boolean
         default: false
 
+permissions:
+  contents: read
+
 env:
   IMAGE: semianalysiswork/vllm-reconfigure:latest