diff --git a/.github/workflows/test-reconfigure.yml b/.github/workflows/test-reconfigure.yml
new file mode 100644
index 000000000..e701c8408
--- /dev/null
+++ b/.github/workflows/test-reconfigure.yml
@@ -0,0 +1,119 @@
+name: Test - Scheduler Reconfigure A/B
+run-name: "Reconfigure A/B | ${{ inputs.model }} tp=${{ inputs.tp }} conc=${{ inputs.conc }} | ${{ inputs.runner }}"
+
+on:
+  workflow_dispatch:
+    inputs:
+      runner:
+        description: "Self-hosted runner label (e.g. b200-dgxc_1)"
+        required: true
+        type: string
+      model:
+        description: "HuggingFace model path"
+        required: true
+        type: string
+        default: "openai/gpt-oss-120b"
+      tp:
+        description: "Tensor parallel size"
+        required: true
+        type: string
+        default: "8"
+      conc:
+        description: "Benchmark concurrency"
+        required: true
+        type: string
+        default: "32"
+      isl:
+        description: "Input sequence length"
+        required: true
+        type: string
+        default: "1024"
+      osl:
+        description: "Output sequence length"
+        required: true
+        type: string
+        default: "1024"
+      extra-args:
+        description: "Extra vllm serve args (e.g. --kv-cache-dtype fp8)"
+        required: false
+        type: string
+      skip-baseline:
+        description: "Skip Phase A (baseline cold starts)"
+        required: false
+        type: boolean
+        default: false
+      skip-reconfig:
+        description: "Skip Phase B (reconfigure)"
+        required: false
+        type: boolean
+        default: false
+
+permissions:
+  contents: read
+
+env:
+  IMAGE: semianalysiswork/vllm-reconfigure:latest
+
+jobs:
+  reconfigure-ab-test:
+    runs-on: ${{ inputs.runner }}
+    timeout-minutes: 300
+    name: "reconfigure-ab | ${{ inputs.model }} | ${{ inputs.runner }}"
+    steps:
+      - name: Resource cleanup (pre-run)
+        run: |
+          if command -v docker >/dev/null 2>&1 && docker info >/dev/null 2>&1; then
+            docker ps -aq | xargs -r docker rm -f || true
+            docker network prune -f || true
+          fi
+
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        with:
+          ref: ${{ github.ref }}
+          clean: false
+
+      - name: Pull image
+        run: docker pull "$IMAGE"
+
+      - name: Run A/B test
+        # Inputs are passed through env rather than interpolated into the script,
+        # so quotes or shell metacharacters in an input cannot alter the command.
+        env:
+          MODEL: "${{ inputs.model }}"
+          TP: "${{ inputs.tp }}"
+          CONC: "${{ inputs.conc }}"
+          ISL: "${{ inputs.isl }}"
+          OSL: "${{ inputs.osl }}"
+          VLLM_EXTRA_ARGS: "${{ inputs.extra-args }}"
+          SKIP_BASELINE: "${{ inputs.skip-baseline && '1' || '0' }}"
+          SKIP_RECONFIG: "${{ inputs.skip-reconfig && '1' || '0' }}"
+        run: |
+          set -x
+          HF_HUB_CACHE_MOUNT="/raid/hf_hub_cache/"
+          docker run --rm --init --network host \
+            --runtime nvidia --gpus all --ipc host --privileged \
+            --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 \
+            -v "${HF_HUB_CACHE_MOUNT}:/root/.cache/huggingface" \
+            -v "$GITHUB_WORKSPACE:/workspace" -w /workspace \
+            -e HF_TOKEN -e MODEL -e TP -e CONC -e ISL -e OSL \
+            -e VLLM_EXTRA_ARGS -e SKIP_BASELINE -e SKIP_RECONFIG \
+            -e PORT=8888 -e RANDOM_RANGE_RATIO=0.8 \
+            -e RESULT_FILENAME="reconfigure_ab_test" \
+            -e NCCL_GRAPH_REGISTER=0 -e PYTHONPYCACHEPREFIX=/tmp/pycache/ \
+            "$IMAGE" bash benchmarks/test_reconfigure_sweep.sh
+
+      - name: Upload results
+        if: always()
+        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
+        with:
+          name: reconfigure-ab-results-${{ inputs.runner }}
+          path: |
+            results_reconfigure_test/
+          retention-days: 30
+
+      - name: Resource cleanup (post-run)
+        if: always()
+        run: |
+          if command -v docker >/dev/null 2>&1 && docker info >/dev/null 2>&1; then
+            docker ps -aq | xargs -r docker rm -f || true
+          fi
diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
index 535313252..608261a06 100644
--- a/benchmarks/benchmark_lib.sh
+++ b/benchmarks/benchmark_lib.sh
@@ -9,6 +9,57 @@ export PYTHONDONTWRITEBYTECODE=1
 export PYTHONPYCACHEPREFIX="${PYTHONPYCACHEPREFIX:-/tmp/inferencex-pycache}"
 mkdir -p "$PYTHONPYCACHEPREFIX" 2>/dev/null || true
 
+# --------------------------------
+# vLLM dynamic scheduler reconfiguration
+# --------------------------------
+
+# Reconfigure vLLM scheduler limits on a running endpoint. This requires a vLLM
+# build that exposes POST /pause, POST /reconfigure, and POST /resume.
+# The feature is opt-in via VLLM_DYNAMIC_RECONFIGURE=1 and is intended for
+# single-server sweeps where model, parallelism, and cache layout stay fixed.
+#
+# $1 is the server port; VLLM_DYNAMIC_RECONFIGURE_BASE_URL overrides the whole URL.
+# Supported env vars (set before calling; positive integers only):
+#   VLLM_MAX_NUM_BATCHED_TOKENS  -- max tokens scheduled per step
+#   VLLM_MAX_NUM_SEQS            -- max concurrent sequences
+# Returns non-zero if neither is set, a value is invalid, or any request fails.
+reconfigure_vllm_scheduler() {
+    local port="$1"
+    local base_url="${VLLM_DYNAMIC_RECONFIGURE_BASE_URL:-http://0.0.0.0:$port}"
+    local json="" pair var value rc=0
+    # Values are spliced into the JSON body unquoted, so reject non-integers here.
+    for pair in max_num_batched_tokens=VLLM_MAX_NUM_BATCHED_TOKENS max_num_seqs=VLLM_MAX_NUM_SEQS; do
+        var="${pair#*=}"
+        value="${!var:-}"
+        [[ -n "$value" ]] || continue
+        if [[ ! "$value" =~ ^[1-9][0-9]*$ ]]; then
+            echo "ERROR: $var must be a positive integer, got '$value'" >&2
+            return 1
+        fi
+        json+="${json:+,}\"${pair%%=*}\":$value"
+    done
+    if [[ -z "$json" ]]; then
+        echo "VLLM_DYNAMIC_RECONFIGURE=1 but no VLLM scheduler parameters were set"
+        return 1
+    fi
+    json="{$json}"
+
+    echo "Reconfiguring vLLM scheduler at $base_url: $json"
+    curl -fsS -X POST "$base_url/pause?mode=keep&clear_cache=true" || {
+        echo "ERROR: /pause failed; scheduler limits were not changed" >&2
+        return 1
+    }
+    curl -fsS -X POST "$base_url/reconfigure" \
+        -H "Content-Type: application/json" -d "$json" || rc=$?
+    # Always resume so the server is never left paused on failure.
+    curl -fsS -X POST "$base_url/resume" || {
+        echo "ERROR: /resume failed; the server may still be paused" >&2
+        return 1
+    }
+    [[ "$rc" -eq 0 ]] || echo "ERROR: /reconfigure failed (curl exit code $rc)" >&2
+    return "$rc"
+}
+
 # --------------------------------
 # GPU monitoring helpers
 # --------------------------------
@@ -326,6 +377,10 @@ run_benchmark_serving() {
         num_prompts="$max_concurrency"
     fi
 
+    if [[ "${VLLM_DYNAMIC_RECONFIGURE:-0}" == "1" && "$backend" == "vllm" ]]; then
+        reconfigure_vllm_scheduler "$port"
+    fi
+
     # Build benchmark command
     local benchmark_cmd=(
         python3 "$workspace_dir/utils/bench_serving/benchmark_serving.py"
@@ -805,3 +860,81 @@ run_eval() {
     fi
     return $eval_rc
 }
+
+# --------------------------------
+# Patched vLLM distribution helpers
+# --------------------------------
+
+# Install a patched vLLM build before launching `vllm serve`. This is optional
+# and only needed when the active container/image does not already include the
+# dynamic scheduler reconfiguration API.
+#
+# Supported env vars:
+#   VLLM_PATCHED_WHEEL        -- local or remote wheel path/URL
+#   VLLM_PATCHED_REPO         -- git repository URL for patched vLLM
+#   VLLM_PATCHED_REF          -- branch, tag, or commit for VLLM_PATCHED_REPO
+#   VLLM_PATCHED_CHECKOUT     -- existing mounted checkout to install editable
+#   VLLM_PATCHED_INSTALL_MODE -- wheel, git, editable, or auto (default)
+#   VLLM_PATCHED_GIT_DIR      -- clone target for git mode (default /tmp/patched-vllm)
+#   VLLM_USE_PRECOMPILED      -- set in pip's environment for git/editable (default 1)
+# Returns 0 on success or when no source is configured, non-zero when a step fails.
+install_patched_vllm() {
+    local mode="${VLLM_PATCHED_INSTALL_MODE:-auto}"
+    if [[ "$mode" == "auto" ]]; then
+        if [[ -n "${VLLM_PATCHED_WHEEL:-}" ]]; then mode="wheel"
+        elif [[ -n "${VLLM_PATCHED_CHECKOUT:-}" ]]; then mode="editable"
+        elif [[ -n "${VLLM_PATCHED_REPO:-}" ]]; then mode="git"
+        else
+            echo "No patched vLLM install source configured; using existing vLLM"
+            return 0
+        fi
+    fi
+    # "|| return 1" on each step keeps the version report below from masking a failure.
+    case "$mode" in
+        wheel)
+            if [[ -z "${VLLM_PATCHED_WHEEL:-}" ]]; then
+                echo "VLLM_PATCHED_INSTALL_MODE=wheel requires VLLM_PATCHED_WHEEL"
+                return 1
+            fi
+            echo "Installing patched vLLM wheel: $VLLM_PATCHED_WHEEL"
+            python3 -m pip install --no-cache-dir --no-deps --force-reinstall "$VLLM_PATCHED_WHEEL" || return 1
+            ;;
+        git)
+            if [[ -z "${VLLM_PATCHED_REPO:-}" || -z "${VLLM_PATCHED_REF:-}" ]]; then
+                echo "VLLM_PATCHED_INSTALL_MODE=git requires VLLM_PATCHED_REPO and VLLM_PATCHED_REF"
+                return 1
+            fi
+            local checkout_dir="${VLLM_PATCHED_GIT_DIR:-/tmp/patched-vllm}"
+            rm -rf "$checkout_dir"
+            # --branch takes only branch/tag names, so retry with a full clone for commit SHAs.
+            git clone --depth 1 --branch "$VLLM_PATCHED_REF" "$VLLM_PATCHED_REPO" "$checkout_dir" || {
+                git clone "$VLLM_PATCHED_REPO" "$checkout_dir" || return 1
+                git -C "$checkout_dir" checkout "$VLLM_PATCHED_REF" || return 1
+            }
+            echo "Installing patched vLLM from $VLLM_PATCHED_REPO@$VLLM_PATCHED_REF"
+            VLLM_USE_PRECOMPILED=${VLLM_USE_PRECOMPILED:-1} \
+                python3 -m pip install --no-cache-dir -e "$checkout_dir" || return 1
+            ;;
+        editable)
+            if [[ -z "${VLLM_PATCHED_CHECKOUT:-}" ]]; then
+                echo "VLLM_PATCHED_INSTALL_MODE=editable requires VLLM_PATCHED_CHECKOUT"
+                return 1
+            fi
+            echo "Installing patched vLLM editable checkout: $VLLM_PATCHED_CHECKOUT"
+            VLLM_USE_PRECOMPILED=${VLLM_USE_PRECOMPILED:-1} \
+                python3 -m pip install --no-cache-dir -e "$VLLM_PATCHED_CHECKOUT" || return 1
+            ;;
+        *)
+            echo "Unknown VLLM_PATCHED_INSTALL_MODE: $mode"
+            return 1
+            ;;
+    esac
+
+    python3 - <<'PY'
+import importlib.metadata
+try:
+    print("Installed vLLM version:", importlib.metadata.version("vllm"))
+except importlib.metadata.PackageNotFoundError:
+    print("Installed vLLM version: unknown")
+PY
+}
diff --git a/benchmarks/test_reconfigure_sweep.sh b/benchmarks/test_reconfigure_sweep.sh
new file mode 100755
index 000000000..3a7d5f08d
--- /dev/null
+++ b/benchmarks/test_reconfigure_sweep.sh
@@ -0,0 +1,204 @@
+#!/usr/bin/env bash
+# A/B test: compare N cold starts (baseline) vs 1 start + N reconfigure cycles.
+#
+# Run this directly on a GPU node inside a vLLM container that includes the
+# /reconfigure endpoint. Mount the inferencex workspace at /workspace.
+#
+# Required env vars:
+#   MODEL   -- HuggingFace model path  (e.g. openai/gpt-oss-120b)
+#   TP      -- tensor-parallel size    (e.g. 8)
+#   CONC    -- benchmark concurrency   (e.g. 32)
+#   ISL     -- input sequence length   (e.g. 1024)
+#   OSL     -- output sequence length  (e.g. 1024)
+#
+# Optional:
+#   PORT              -- server port (default 8888)
+#   VLLM_EXTRA_ARGS   -- extra args passed to vllm serve (e.g. "--kv-cache-dtype fp8")
+#   SKIP_BASELINE     -- set to 1 to skip the baseline phase
+#   SKIP_RECONFIG     -- set to 1 to skip the reconfigure phase
+#
+# Usage:
+#   export MODEL=openai/gpt-oss-120b TP=8 CONC=32 ISL=1024 OSL=1024
+#   bash benchmarks/test_reconfigure_sweep.sh
+set -euo pipefail
+
+source "$(dirname "$0")/benchmark_lib.sh"
+
+check_env_vars MODEL TP CONC ISL OSL
+
+PORT=${PORT:-8888}
+MAX_MODEL_LEN=$(( ISL + OSL + 256 ))
+NUM_PROMPTS=$(( CONC * 10 ))
+
+# Parameter grid to sweep
+MNB_VALUES=(4096 8192 16384)
+MNS_VALUES=(256 512)
+GRID_SIZE=$(( ${#MNB_VALUES[@]} * ${#MNS_VALUES[@]} ))
+
+RESULTS_BASE=/workspace/results_reconfigure_test
+RESULTS_A="${RESULTS_BASE}/baseline"
+RESULTS_B="${RESULTS_BASE}/reconfig"
+mkdir -p "$RESULTS_A" "$RESULTS_B"
+
+SERVER_LOG_DIR="${RESULTS_BASE}/logs"
+mkdir -p "$SERVER_LOG_DIR"
+
+# Start `vllm serve` in the background, record its PID in SERVER_PID, then call
+# wait_for_server_ready. $1/$2 = max batched tokens / max seqs, $3 = log file.
+start_server() {
+    local max_batched_tokens="$1" max_seqs="$2" log_file="$3"
+    # VLLM_EXTRA_ARGS stays unquoted so it word-splits into separate flags.
+    local -a serve_args=(
+        --host 0.0.0.0
+        --port "$PORT"
+        --tensor-parallel-size "$TP"
+        --gpu-memory-utilization 0.9
+        --max-model-len "$MAX_MODEL_LEN"
+        --max-num-batched-tokens "$max_batched_tokens"
+        --max-num-seqs "$max_seqs"
+        --no-enable-prefix-caching
+        --disable-log-requests
+        ${VLLM_EXTRA_ARGS:-}
+    )
+    vllm serve "$MODEL" "${serve_args[@]}" > "$log_file" 2>&1 &
+    SERVER_PID=$!
+    wait_for_server_ready --port "$PORT" --server-log "$log_file" --server-pid "$SERVER_PID"
+}
+
+# Terminate the server recorded in SERVER_PID (no-op when it is unset or empty).
+kill_server() {
+    [[ -n "${SERVER_PID:-}" ]] || return 0
+    kill "$SERVER_PID" 2>/dev/null || true
+    wait "$SERVER_PID" 2>/dev/null || true
+    unset SERVER_PID
+    sleep 2
+}
+
+# Run run_benchmark_serving against the current server.
+# $1 = result directory, $2 = result file name.
+run_bench() {
+    local out_dir="$1" out_name="$2"
+    local -a bench_args=(
+        --model "$MODEL" --port "$PORT" --backend vllm
+        --input-len "$ISL" --output-len "$OSL"
+        --random-range-ratio 0.8
+        --num-prompts "$NUM_PROMPTS"
+        --max-concurrency "$CONC"
+        --result-filename "$out_name"
+        --result-dir "$out_dir"
+        --server-pid "$SERVER_PID"
+    )
+    run_benchmark_serving "${bench_args[@]}"
+}
+
+trap kill_server EXIT
+
+pip install -q datasets pandas 2>/dev/null || true
+
+# ──────────────────────────────────────────────
+# Phase A: Baseline — separate server per config
+# ──────────────────────────────────────────────
+if [[ "${SKIP_BASELINE:-0}" != "1" ]]; then
+    echo ""
+    echo "###############################################"
+    echo "# Phase A: Baseline (${GRID_SIZE} cold starts)"
+    echo "###############################################"
+    # A_START anchors the phase total; A_RUN numbers the runs for log file names.
+    A_START=$(date +%s)
+    A_RUN=0
+
+    for mnb in "${MNB_VALUES[@]}"; do
+      for mns in "${MNS_VALUES[@]}"; do
+        A_RUN=$((A_RUN + 1))
+        echo ""
+        echo "--- A.$A_RUN: max_num_batched_tokens=$mnb max_num_seqs=$mns ---"
+        # "Startup" is the time spent inside start_server for this combination.
+        RUN_START=$(date +%s)
+        start_server "$mnb" "$mns" "${SERVER_LOG_DIR}/server_a_${A_RUN}.log"
+        READY_TIME=$(date +%s)
+        echo "  Startup: $((READY_TIME - RUN_START))s"
+
+        run_bench "$RESULTS_A" "baseline_mnb${mnb}_mns${mns}"
+        # Stop this server so the next combination launches a new process.
+        kill_server
+        RUN_END=$(date +%s)
+        echo "  Total: $((RUN_END - RUN_START))s"
+      done
+    done
+
+    A_END=$(date +%s)
+    A_TOTAL=$((A_END - A_START))
+    echo ""
+    echo "Phase A total: ${A_TOTAL}s"
+else
+    A_TOTAL="(skipped)"
+fi
+
+# ──────────────────────────────────────────────
+# Phase B: Reconfigure — single server, N cycles
+# ──────────────────────────────────────────────
+if [[ "${SKIP_RECONFIG:-0}" != "1" ]]; then
+    echo ""
+    echo "###############################################"
+    echo "# Phase B: Reconfigure (1 cold start)"
+    echo "###############################################"
+
+    B_START=$(date +%s)
+    # [-1] picks the last array element, so both grids must stay sorted ascending.
+    # Start with the largest values so CUDA graphs and KV cache cover all configs
+    INIT_MNB=${MNB_VALUES[-1]}
+    INIT_MNS=${MNS_VALUES[-1]}
+
+    echo ""
+    echo "--- Starting server (mnb=$INIT_MNB mns=$INIT_MNS) ---"
+    STARTUP_START=$(date +%s)
+    start_server "$INIT_MNB" "$INIT_MNS" "${SERVER_LOG_DIR}/server_b.log"
+    STARTUP_END=$(date +%s)
+    echo "  Startup: $((STARTUP_END - STARTUP_START))s"
+
+    B_RUN=0
+    for mnb in "${MNB_VALUES[@]}"; do
+      for mns in "${MNS_VALUES[@]}"; do
+        B_RUN=$((B_RUN + 1))
+        echo ""
+        echo "--- B.$B_RUN: max_num_batched_tokens=$mnb max_num_seqs=$mns ---"
+        # The timed span covers the reconfigure call and the benchmark together.
+        RECONF_START=$(date +%s)
+
+        # Set env vars — run_benchmark_serving picks these up via
+        # the VLLM_DYNAMIC_RECONFIGURE hook and calls
+        # reconfigure_vllm_scheduler automatically.
+        export VLLM_DYNAMIC_RECONFIGURE=1
+        export VLLM_MAX_NUM_BATCHED_TOKENS="$mnb"
+        export VLLM_MAX_NUM_SEQS="$mns"
+
+        run_bench "$RESULTS_B" "reconfig_mnb${mnb}_mns${mns}"
+
+        RUN_END=$(date +%s)
+        echo "  Total: $((RUN_END - RECONF_START))s"
+      done
+    done
+    # One server handled every combination; stop it once the sweep is finished.
+    kill_server
+
+    B_END=$(date +%s)
+    B_TOTAL=$((B_END - B_START))
+    B_STARTUP=$((STARTUP_END - STARTUP_START))
+    echo ""
+    echo "Phase B total: ${B_TOTAL}s (startup: ${B_STARTUP}s)"
+else
+    B_TOTAL="(skipped)"
+fi
+
+# ──────────────────────────────────────────────
+# Summary
+# ──────────────────────────────────────────────
+# A phase total is a number of seconds or "(skipped)"; append the unit only
+# to numbers so the report never prints "(skipped)s".
+[[ "$A_TOTAL" =~ ^[0-9]+$ ]] && A_TOTAL+="s"
+[[ "$B_TOTAL" =~ ^[0-9]+$ ]] && B_TOTAL+="s"
+RULE="=============================================="
+printf '%s\n' "" "$RULE" " Comparison" "$RULE"
+echo " Phase A (baseline, ${GRID_SIZE} cold starts):  ${A_TOTAL}"
+echo " Phase B (reconfigure, 1 cold start):   ${B_TOTAL}"
+printf '%s\n' "" " Results saved to: ${RESULTS_BASE}/" "$RULE"
diff --git a/docs/vllm-dynamic-scheduler-reconfigure.md b/docs/vllm-dynamic-scheduler-reconfigure.md
new file mode 100644
index 000000000..8d80a3d49
--- /dev/null
+++ b/docs/vllm-dynamic-scheduler-reconfigure.md
@@ -0,0 +1,109 @@
+# vLLM Dynamic Scheduler Reconfiguration
+
+InferenceX can optionally reconfigure selected vLLM scheduler limits between
+benchmark runs without restarting the serving endpoint. This is useful for
+single-server sweeps where the model, parallelism, quantization, max context,
+and KV cache layout stay fixed, but each benchmark case uses different scheduler
+admission limits.
+
+## Requirements
+
+This feature requires a vLLM build that exposes these HTTP endpoints:
+
+- `POST /pause?mode=keep&clear_cache=true`
+- `POST /reconfigure` (JSON body)
+- `POST /resume`
+
+The stock vLLM releases do not provide `/reconfigure`. You need either:
+
+1. A Docker image built from the vLLM branch containing the reconfigure API
+   (see [Building the Image From Source](#building-the-image-from-source) below).
+2. A runtime patch applied at container start from a mounted vLLM checkout.
+
+## Reconfigurable Parameters
+
+| Parameter | Type | Description |
+|-----------|------|-------------|
+| `max_num_batched_tokens` | int > 0 | Max tokens scheduled per step |
+| `max_num_seqs` | int > 0 | Max concurrent sequences |
+| `enable_chunked_prefill` | bool | Toggle chunked prefill |
+| `long_prefill_token_threshold` | int >= 0 | Cap prefill chunk size (0 = no cap) |
+
+Everything else (TP, EP, GPU memory, KV cache dtype, block size, CUDA graphs,
+compilation config, etc.) is baked in at startup and cannot be changed.
+
+## Enabling
+
+Set `VLLM_DYNAMIC_RECONFIGURE=1` and the desired parameter env vars before
+calling `run_benchmark_serving` with `--backend vllm`:
+
+```bash
+export VLLM_DYNAMIC_RECONFIGURE=1
+export VLLM_MAX_NUM_BATCHED_TOKENS=32768
+export VLLM_MAX_NUM_SEQS=128
+```
+
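+`run_benchmark_serving` calls `reconfigure_vllm_scheduler` before each benchmark
+run. The helper pauses vLLM, sends a JSON body to `/reconfigure`, and resumes. It reads only the two env vars shown above, so `enable_chunked_prefill` and `long_prefill_token_threshold` are never sent by it.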
+
+## A/B Test Script
+
+`benchmarks/test_reconfigure_sweep.sh` runs back-to-back comparisons:
+
+- **Phase A (baseline):** N cold starts, one per parameter combo
+- **Phase B (reconfigure):** 1 cold start, N reconfigure cycles
+
+```bash
+export MODEL=openai/gpt-oss-120b TP=8 CONC=32 ISL=1024 OSL=1024
+bash benchmarks/test_reconfigure_sweep.sh
+```
+
+## Pre-built Image
+
+A pre-built image is available on Docker Hub:
+
+    semianalysiswork/vllm-reconfigure:latest
+
+This overlays the reconfigure API onto `vllm/vllm-openai:v0.18.0`.
+Source: [JordanNanos/vllm `feature/reconfigure-scheduler`](https://github.com/JordanNanos/vllm/tree/feature/reconfigure-scheduler),
+Dockerfile: `docker/Dockerfile.single-node-nvidia`.
+
+## Running a Single-Node Test
+
+```bash
+docker run --rm --init --network host \
+  --runtime nvidia --gpus all --ipc host --privileged \
+  --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 \
+  -v $HF_HUB_CACHE:/root/.cache/huggingface \
+  -v $(pwd):/workspace -w /workspace \
+  -e HF_TOKEN -e PORT=8888 \
+  -e MODEL=openai/gpt-oss-120b \
+  -e TP=8 -e CONC=32 \
+  -e ISL=1024 -e OSL=1024 \
+  semianalysiswork/vllm-reconfigure:latest \
+  bash benchmarks/test_reconfigure_sweep.sh
+```
+
+Use `SKIP_BASELINE=1` or `SKIP_RECONFIG=1` to run only one phase.
+Pass extra vLLM flags via `VLLM_EXTRA_ARGS` (e.g. `--kv-cache-dtype fp8`).
+
+## Building the Image From Source
+
+To rebuild from the vLLM fork:
+
+```bash
+git clone https://github.com/JordanNanos/vllm.git -b feature/reconfigure-scheduler
+cd vllm
+docker build --platform linux/amd64 \
+  -f docker/Dockerfile.single-node-nvidia \
+  --build-arg BASE_IMAGE=vllm/vllm-openai:v0.18.0 \
+  -t semianalysiswork/vllm-reconfigure:latest .
+```
+
+## Safety Notes
+
+Do not use this mechanism to change startup-time engine settings such as model,
+quantization, tensor/data/expert parallelism, KV cache dtype, block size,
+`gpu_memory_utilization`, or `max_model_len`. Launch vLLM with the largest static
+capacity required by the sweep and use dynamic reconfiguration only for scheduler
+limits.
diff --git a/docs/vllm-patched-distribution.md b/docs/vllm-patched-distribution.md
new file mode 100644
index 000000000..ab94f4245
--- /dev/null
+++ b/docs/vllm-patched-distribution.md
@@ -0,0 +1,94 @@
+# Distributing Patched vLLM for Dynamic Reconfiguration
+
+Dynamic scheduler reconfiguration requires a vLLM build that includes the
+runtime reconfiguration API used by InferenceX:
+
+- `POST /pause`
+- `POST /reconfigure`
+- `POST /resume`
+
+A stock vLLM image or wheel that does not include this API can still run normal
+InferenceX benchmarks. The patched build is required only when
+`VLLM_DYNAMIC_RECONFIGURE=1` is set.
+
+## Option 1: Custom Benchmark Container
+
+Build a container that starts from the normal InferenceX/vLLM benchmark image and
+installs the patched vLLM branch or wheel at image build time. This is the most
+reproducible option for cluster sweeps.
+
+Example Dockerfile pattern:
+
+```Dockerfile
+FROM vllm/vllm-openai:<base-tag>
+
+ARG VLLM_REPO=https://github.com/<org>/vllm.git
+ARG VLLM_REF=<patched-commit-or-branch>
+
+RUN git clone "$VLLM_REPO" /opt/patched-vllm \
+ && cd /opt/patched-vllm \
+ && git checkout "$VLLM_REF" \
+ && VLLM_USE_PRECOMPILED=1 python3 -m pip install --no-cache-dir -e .
+
+LABEL org.opencontainers.image.source-vllm="$VLLM_REPO"
+LABEL org.opencontainers.image.revision-vllm="$VLLM_REF"
+```
+
+Then update the InferenceX config entry to use that image for the vLLM run.
+
+## Option 2: Install a Pinned Wheel Before `vllm serve`
+
+Build a patched wheel once, publish it to an artifact store, and install it in
+the benchmark job before launching `vllm serve`.
+
+```bash
+export VLLM_PATCHED_INSTALL_MODE=wheel
+export VLLM_PATCHED_WHEEL=/workspace/wheels/vllm-<version>-patched.whl
+install_patched_vllm
+```
+
+Remote wheel URLs also work if the runner has access:
+
+```bash
+export VLLM_PATCHED_WHEEL=https://example.internal/wheels/vllm-patched.whl
+install_patched_vllm
+```
+
+## Option 3: Mount a Patched Checkout and Install Editable
+
+This is useful for fast experiments, but less reproducible than an image or
+wheel. Mount the patched vLLM checkout into the job and run:
+
+```bash
+export VLLM_PATCHED_INSTALL_MODE=editable
+export VLLM_PATCHED_CHECKOUT=/workspace/vllm-patched
+install_patched_vllm
+```
+
+## Option 4: Clone and Install a Pinned Git Ref
+
+This is convenient when the cluster has network access to the patched branch:
+
+```bash
+export VLLM_PATCHED_INSTALL_MODE=git
+export VLLM_PATCHED_REPO=https://github.com/<org>/vllm.git
+export VLLM_PATCHED_REF=<patched-commit-or-branch>
+install_patched_vllm
+```
+
+For reproducible benchmark results, prefer a commit SHA over a mutable branch.
+
+## Recommended Cluster Flow
+
+1. Start from a custom image or call `install_patched_vllm` before `vllm serve`.
+2. Launch vLLM with the largest static capacity needed by the sweep.
+3. Enable scheduler reconfiguration for the sweep:
+
+```bash
+export VLLM_DYNAMIC_RECONFIGURE=1
+export VLLM_MAX_NUM_BATCHED_TOKENS=32768
+export VLLM_MAX_NUM_SEQS="$CONC"
+```
+
+4. Run `run_benchmark_serving` as usual.
+5. Record the patched vLLM commit SHA or wheel URL in the run notes/results.