diff --git a/.github/workflows/test-reconfigure.yml b/.github/workflows/test-reconfigure.yml
new file mode 100644
index 000000000..e701c8408
--- /dev/null
+++ b/.github/workflows/test-reconfigure.yml
@@ -0,0 +1,119 @@
+name: Test - Scheduler Reconfigure A/B
+run-name: "Reconfigure A/B | ${{ inputs.model }} tp=${{ inputs.tp }} conc=${{ inputs.conc }} | ${{ inputs.runner }}"
+
+on:
+  workflow_dispatch:
+    inputs:
+      runner:
+        description: "Self-hosted runner label (e.g. b200-dgxc_1)"
+        required: true
+        type: string
+      model:
+        description: "HuggingFace model path"
+        required: true
+        type: string
+        default: "openai/gpt-oss-120b"
+      tp:
+        description: "Tensor parallel size"
+        required: true
+        type: string
+        default: "8"
+      conc:
+        description: "Benchmark concurrency"
+        required: true
+        type: string
+        default: "32"
+      isl:
+        description: "Input sequence length"
+        required: true
+        type: string
+        default: "1024"
+      osl:
+        description: "Output sequence length"
+        required: true
+        type: string
+        default: "1024"
+      extra-args:
+        description: "Extra vllm serve args (e.g. --kv-cache-dtype fp8)"
+        required: false
+        type: string
+      skip-baseline:
+        description: "Skip Phase A (baseline cold starts)"
+        required: false
+        type: boolean
+        default: false
+      skip-reconfig:
+        description: "Skip Phase B (reconfigure)"
+        required: false
+        type: boolean
+        default: false
+
+permissions:
+  contents: read
+
+env:
+  IMAGE: semianalysiswork/vllm-reconfigure:latest
+
+jobs:
+  reconfigure-ab-test:
+    runs-on: ${{ inputs.runner }}
+    timeout-minutes: 300
+    name: "reconfigure-ab | ${{ inputs.model }} | ${{ inputs.runner }}"
+    steps:
+      - name: Resource cleanup (pre-run)
+        run: |
+          if command -v docker >/dev/null 2>&1 && docker info >/dev/null 2>&1; then
+            docker ps -aq | xargs -r docker rm -f || true
+            docker network prune -f || true
+          fi
+
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        with:
+          ref: ${{ github.ref }}
+          clean: false
+
+      - name: Pull image
+        run: docker pull "$IMAGE"
+
+      - name: Run A/B test
+        # Workflow inputs reach the shell through `env:` instead of being
+        # interpolated into the script text, so a crafted input value cannot
+        # inject shell commands on the self-hosted runner.
+        env:
+          MODEL: ${{ inputs.model }}
+          TP: ${{ inputs.tp }}
+          CONC: ${{ inputs.conc }}
+          ISL: ${{ inputs.isl }}
+          OSL: ${{ inputs.osl }}
+          VLLM_EXTRA_ARGS: ${{ inputs.extra-args }}
+          SKIP_BASELINE: "${{ inputs.skip-baseline && '1' || '0' }}"
+          SKIP_RECONFIG: "${{ inputs.skip-reconfig && '1' || '0' }}"
+        run: |
+          set -x
+          HF_HUB_CACHE_MOUNT="/raid/hf_hub_cache/"
+
+          # `-e NAME` without a value forwards NAME from the step environment.
+          docker run --rm --init --network host \
+            --runtime nvidia --gpus all --ipc host --privileged \
+            --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 \
+            -v "${HF_HUB_CACHE_MOUNT}:/root/.cache/huggingface" \
+            -v "$GITHUB_WORKSPACE:/workspace" -w /workspace \
+            -e HF_TOKEN \
+            -e MODEL \
+            -e TP \
+            -e CONC \
+            -e ISL \
+            -e OSL \
+            -e PORT=8888 \
+            -e RANDOM_RANGE_RATIO=0.8 \
+            -e VLLM_EXTRA_ARGS \
+            -e SKIP_BASELINE \
+            -e SKIP_RECONFIG \
+            -e RESULT_FILENAME="reconfigure_ab_test" \
+            -e NCCL_GRAPH_REGISTER=0 \
+            -e PYTHONPYCACHEPREFIX=/tmp/pycache/ \
+            "$IMAGE" \
+            bash benchmarks/test_reconfigure_sweep.sh
+
+      - name: Upload results
+        if: always()
+        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
+        with:
+          name: reconfigure-ab-results-${{ inputs.runner }}
+          path: |
+            results_reconfigure_test/
+          retention-days: 30
+
+      - name: Resource cleanup (post-run)
+        if: always()
+        run: |
+          if command -v docker >/dev/null 2>&1 && docker info >/dev/null 2>&1; then
+            docker ps -aq | xargs -r docker rm -f || true
+          fi
diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
index 535313252..608261a06 100644
--- a/benchmarks/benchmark_lib.sh
+++ b/benchmarks/benchmark_lib.sh
@@ -9,6 +9,57 @@ export PYTHONDONTWRITEBYTECODE=1
 export PYTHONPYCACHEPREFIX="${PYTHONPYCACHEPREFIX:-/tmp/inferencex-pycache}"
 mkdir -p "$PYTHONPYCACHEPREFIX" 2>/dev/null || true
 
+# --------------------------------
+# vLLM dynamic scheduler reconfiguration
+# --------------------------------
+
+# Reconfigure vLLM scheduler limits on a running endpoint. This requires a vLLM
+# build that exposes POST /pause, POST /reconfigure, and POST /resume.
+# The feature is opt-in via VLLM_DYNAMIC_RECONFIGURE=1 and is intended for
+# single-server sweeps where model, parallelism, and cache layout stay fixed.
+#
+# Supported env vars (set before calling):
+#   VLLM_MAX_NUM_BATCHED_TOKENS        -- max tokens scheduled per step
+#   VLLM_MAX_NUM_SEQS                  -- max concurrent sequences
+#   VLLM_DYNAMIC_RECONFIGURE_BASE_URL  -- endpoint base URL
+#                                         (default http://0.0.0.0:$port)
+#
+# Arguments:
+#   $1 -- port of the running vLLM server
+# Returns:
+#   0 on success; non-zero when no parameter is set, a value is not a positive
+#   integer, or any of the /pause, /reconfigure, /resume requests fails.
+reconfigure_vllm_scheduler() {
+    local port="$1"
+    local base_url="${VLLM_DYNAMIC_RECONFIGURE_BASE_URL:-http://0.0.0.0:$port}"
+
+    # Build the JSON body from whichever env vars are set. Values are spliced
+    # in unquoted, so only positive decimal integers are accepted; anything
+    # else would produce malformed (or attacker-shaped) JSON.
+    local json="{"
+    local sep=""
+    local spec var key val
+    for spec in \
+        "VLLM_MAX_NUM_BATCHED_TOKENS:max_num_batched_tokens" \
+        "VLLM_MAX_NUM_SEQS:max_num_seqs"; do
+        var="${spec%%:*}"
+        key="${spec#*:}"
+        val="${!var:-}"
+        if [[ -z "$val" ]]; then
+            continue
+        fi
+        if [[ ! "$val" =~ ^[1-9][0-9]*$ ]]; then
+            echo "ERROR: $var must be a positive integer, got '$val'" >&2
+            return 1
+        fi
+        json+="${sep}\"${key}\":${val}"
+        sep=","
+    done
+    json+="}"
+
+    if [[ "$json" == "{}" ]]; then
+        echo "VLLM_DYNAMIC_RECONFIGURE=1 but no VLLM scheduler parameters were set" >&2
+        return 1
+    fi
+
+    echo "Reconfiguring vLLM scheduler at $base_url: $json"
+
+    # Every curl status is captured with `|| rc=$?` so the sequence behaves the
+    # same whether or not the caller runs under `set -e`.
+    local rc=0
+    curl -fsS -X POST "$base_url/pause?mode=keep&clear_cache=true" || rc=$?
+    if [[ "$rc" -ne 0 ]]; then
+        echo "ERROR: /pause failed (curl exit code $rc)" >&2
+        return "$rc"
+    fi
+
+    curl -fsS -X POST "$base_url/reconfigure" \
+        -H "Content-Type: application/json" \
+        -d "$json" || rc=$?
+
+    # Always resume so the server is never left paused on failure.
+    local resume_rc=0
+    curl -fsS -X POST "$base_url/resume" || resume_rc=$?
+
+    if [[ "$rc" -ne 0 ]]; then
+        echo "ERROR: /reconfigure failed (curl exit code $rc)" >&2
+        return "$rc"
+    fi
+    if [[ "$resume_rc" -ne 0 ]]; then
+        echo "ERROR: /resume failed (curl exit code $resume_rc)" >&2
+        return "$resume_rc"
+    fi
+}
+
 # --------------------------------
 # GPU monitoring helpers
 # --------------------------------
@@ -326,6 +377,10 @@ run_benchmark_serving() {
         num_prompts="$max_concurrency"
     fi
 
+    if [[ "${VLLM_DYNAMIC_RECONFIGURE:-0}" == "1" && "$backend" == "vllm" ]]; then
+        reconfigure_vllm_scheduler "$port"
+    fi
+
     # Build benchmark command
     local benchmark_cmd=(
         python3 "$workspace_dir/utils/bench_serving/benchmark_serving.py"
@@ -805,3 +860,81 @@ run_eval() {
     fi
     return $eval_rc
 }
+
+# --------------------------------
+# Patched vLLM distribution helpers
+# --------------------------------
+
+# Install a patched vLLM build before launching `vllm serve`.
This is optional
+# and only needed when the active container/image does not already include the
+# dynamic scheduler reconfiguration API.
+#
+# Supported env vars:
+#   VLLM_PATCHED_WHEEL         -- local or remote wheel path/URL
+#   VLLM_PATCHED_REPO          -- git repository URL for patched vLLM
+#   VLLM_PATCHED_REF           -- branch, tag, or commit for VLLM_PATCHED_REPO
+#   VLLM_PATCHED_CHECKOUT      -- existing mounted checkout to install editable
+#   VLLM_PATCHED_INSTALL_MODE  -- wheel, git, editable, or auto (default)
+#   VLLM_PATCHED_GIT_DIR       -- clone destination for git mode
+#                                 (default /tmp/patched-vllm; removed first)
+#   VLLM_USE_PRECOMPILED       -- forwarded to pip for git/editable installs
+#                                 (default 1)
+#
+# In auto mode the first configured source wins, in the order wheel, editable
+# checkout, git; with nothing configured the existing vLLM is kept.
+#
+# Returns:
+#   0 when nothing needed installing or the install succeeded; 1 on a missing
+#   setting, an unknown mode, or a failed clone/checkout/pip step.
+install_patched_vllm() {
+    local mode="${VLLM_PATCHED_INSTALL_MODE:-auto}"
+
+    if [[ "$mode" == "auto" ]]; then
+        if [[ -n "${VLLM_PATCHED_WHEEL:-}" ]]; then
+            mode="wheel"
+        elif [[ -n "${VLLM_PATCHED_CHECKOUT:-}" ]]; then
+            mode="editable"
+        elif [[ -n "${VLLM_PATCHED_REPO:-}" ]]; then
+            mode="git"
+        else
+            echo "No patched vLLM install source configured; using existing vLLM"
+            return 0
+        fi
+    fi
+
+    # Each install step returns explicitly on failure. Otherwise, when the
+    # caller is not under `set -e`, the version report at the end would mask
+    # the error and the function would return 0.
+    case "$mode" in
+        wheel)
+            if [[ -z "${VLLM_PATCHED_WHEEL:-}" ]]; then
+                echo "VLLM_PATCHED_INSTALL_MODE=wheel requires VLLM_PATCHED_WHEEL" >&2
+                return 1
+            fi
+            echo "Installing patched vLLM wheel: $VLLM_PATCHED_WHEEL"
+            python3 -m pip install --no-cache-dir --no-deps --force-reinstall \
+                "$VLLM_PATCHED_WHEEL" || {
+                echo "ERROR: pip install of patched vLLM wheel failed" >&2
+                return 1
+            }
+            ;;
+        git)
+            if [[ -z "${VLLM_PATCHED_REPO:-}" || -z "${VLLM_PATCHED_REF:-}" ]]; then
+                echo "VLLM_PATCHED_INSTALL_MODE=git requires VLLM_PATCHED_REPO and VLLM_PATCHED_REF" >&2
+                return 1
+            fi
+            local checkout_dir="${VLLM_PATCHED_GIT_DIR:-/tmp/patched-vllm}"
+            rm -rf "$checkout_dir"
+            # A shallow --branch clone only works for branches and tags; for a
+            # commit SHA fall back to a full clone followed by a checkout.
+            if ! git clone --depth 1 --branch "$VLLM_PATCHED_REF" "$VLLM_PATCHED_REPO" "$checkout_dir"; then
+                rm -rf "$checkout_dir"
+                git clone "$VLLM_PATCHED_REPO" "$checkout_dir" \
+                    && git -C "$checkout_dir" checkout "$VLLM_PATCHED_REF" \
+                    || {
+                        echo "ERROR: could not fetch $VLLM_PATCHED_REPO@$VLLM_PATCHED_REF" >&2
+                        return 1
+                    }
+            fi
+            echo "Installing patched vLLM from $VLLM_PATCHED_REPO@$VLLM_PATCHED_REF"
+            VLLM_USE_PRECOMPILED=${VLLM_USE_PRECOMPILED:-1} \
+                python3 -m pip install --no-cache-dir -e "$checkout_dir" || {
+                echo "ERROR: pip install of patched vLLM from git failed" >&2
+                return 1
+            }
+            ;;
+        editable)
+            if [[ -z "${VLLM_PATCHED_CHECKOUT:-}" ]]; then
+                echo "VLLM_PATCHED_INSTALL_MODE=editable requires VLLM_PATCHED_CHECKOUT" >&2
+                return 1
+            fi
+            echo "Installing patched vLLM editable checkout: $VLLM_PATCHED_CHECKOUT"
+            VLLM_USE_PRECOMPILED=${VLLM_USE_PRECOMPILED:-1} \
+                python3 -m pip install --no-cache-dir -e "$VLLM_PATCHED_CHECKOUT" || {
+                echo "ERROR: pip install of patched vLLM editable checkout failed" >&2
+                return 1
+            }
+            ;;
+        *)
+            echo "Unknown VLLM_PATCHED_INSTALL_MODE: $mode" >&2
+            return 1
+            ;;
+    esac
+
+    # Report which vLLM distribution is now visible to Python.
+    python3 - <<'PY'
+import importlib.metadata
+try:
+    print("Installed vLLM version:", importlib.metadata.version("vllm"))
+except importlib.metadata.PackageNotFoundError:
+    print("Installed vLLM version: unknown")
+PY
+}
diff --git a/benchmarks/test_reconfigure_sweep.sh b/benchmarks/test_reconfigure_sweep.sh
new file mode 100755
index 000000000..3a7d5f08d
--- /dev/null
+++ b/benchmarks/test_reconfigure_sweep.sh
@@ -0,0 +1,204 @@
+#!/usr/bin/env bash
+# A/B test: compare N cold starts (baseline) vs 1 start + N reconfigure cycles.
+#
+# Run this directly on a GPU node inside a vLLM container that includes the
+# /reconfigure endpoint. Mount the inferencex workspace at /workspace.
+#
+# Required env vars:
+#   MODEL   -- HuggingFace model path (e.g. openai/gpt-oss-120b)
+#   TP      -- tensor-parallel size (e.g. 8)
+#   CONC    -- benchmark concurrency (e.g. 32)
+#   ISL     -- input sequence length (e.g. 1024)
+#   OSL     -- output sequence length (e.g. 1024)
+#
+# Optional:
+#   PORT             -- server port (default 8888)
+#   VLLM_EXTRA_ARGS  -- extra args passed to vllm serve (e.g. "--kv-cache-dtype fp8")
+#   SKIP_BASELINE    -- set to 1 to skip the baseline phase
+#   SKIP_RECONFIG    -- set to 1 to skip the reconfigure phase
+#
+# Usage:
+#   export MODEL=openai/gpt-oss-120b TP=8 CONC=32 ISL=1024 OSL=1024
+#   bash benchmarks/test_reconfigure_sweep.sh
+set -euo pipefail
+
+source "$(dirname "$0")/benchmark_lib.sh"
+
+check_env_vars MODEL TP CONC ISL OSL
+
+PORT=${PORT:-8888}
+MAX_MODEL_LEN=$(( ISL + OSL + 256 ))
+NUM_PROMPTS=$(( CONC * 10 ))
+
+# Parameter grid to sweep
+MNB_VALUES=(4096 8192 16384)
+MNS_VALUES=(256 512)
+GRID_SIZE=$(( ${#MNB_VALUES[@]} * ${#MNS_VALUES[@]} ))
+
+RESULTS_BASE=/workspace/results_reconfigure_test
+RESULTS_A="${RESULTS_BASE}/baseline"
+RESULTS_B="${RESULTS_BASE}/reconfig"
+mkdir -p "$RESULTS_A" "$RESULTS_B"
+
+SERVER_LOG_DIR="${RESULTS_BASE}/logs"
+mkdir -p "$SERVER_LOG_DIR"
+
+start_server() {
+    local mnb="$1" mns="$2" log="$3"
+
+    # VLLM_EXTRA_ARGS is expanded unquoted on purpose so that it word-splits
+    # into separate flags.
+    vllm serve "$MODEL" \
+        --host 0.0.0.0 --port "$PORT" \
+        --tensor-parallel-size "$TP" \
+        --gpu-memory-utilization 0.9 \
+        --max-model-len "$MAX_MODEL_LEN" \
+        --max-num-batched-tokens "$mnb" \
+        --max-num-seqs "$mns" \
+        --no-enable-prefix-caching \
+        --disable-log-requests \
+        ${VLLM_EXTRA_ARGS:-} \
+        > "$log" 2>&1 &
+    SERVER_PID=$!
+
+    wait_for_server_ready \
+        --port "$PORT" \
+        --server-log "$log" \
+        --server-pid "$SERVER_PID"
+}
+
+# Stop the server started by start_server, if any. Safe to call repeatedly.
+kill_server() {
+    if [[ -n "${SERVER_PID:-}" ]]; then
+        kill "$SERVER_PID" 2>/dev/null || true
+        wait "$SERVER_PID" 2>/dev/null || true
+        unset SERVER_PID
+        sleep 2
+    fi
+}
+
+# Run one benchmark against the current server.
+#   $1 -- result directory, $2 -- result filename
+# RANDOM_RANGE_RATIO (optional env, default 0.8) sets --random-range-ratio.
+run_bench() {
+    local result_dir="$1" result_name="$2"
+
+    run_benchmark_serving \
+        --model "$MODEL" \
+        --port "$PORT" \
+        --backend vllm \
+        --input-len "$ISL" \
+        --output-len "$OSL" \
+        --random-range-ratio "${RANDOM_RANGE_RATIO:-0.8}" \
+        --num-prompts "$NUM_PROMPTS" \
+        --max-concurrency "$CONC" \
+        --result-filename "$result_name" \
+        --result-dir "$result_dir" \
+        --server-pid "$SERVER_PID"
+}
+
+# Print the largest integer among the arguments.
+max_of() {
+    local best="$1" candidate
+    for candidate in "$@"; do
+        if (( candidate > best )); then
+            best="$candidate"
+        fi
+    done
+    echo "$best"
+}
+
+trap kill_server EXIT
+
+pip install -q datasets pandas 2>/dev/null || true
+
+# ──────────────────────────────────────────────
+# Phase A: Baseline — separate server per config
+# ──────────────────────────────────────────────
+if [[ "${SKIP_BASELINE:-0}" != "1" ]]; then
+    echo ""
+    echo "###############################################"
+    echo "# Phase A: Baseline (${GRID_SIZE} cold starts)"
+    echo "###############################################"
+
+    # The baseline must measure plain cold starts, so make sure an inherited
+    # VLLM_DYNAMIC_RECONFIGURE=1 cannot trigger the reconfigure hook here.
+    export VLLM_DYNAMIC_RECONFIGURE=0
+
+    A_START=$(date +%s)
+    A_RUN=0
+
+    for mnb in "${MNB_VALUES[@]}"; do
+        for mns in "${MNS_VALUES[@]}"; do
+            A_RUN=$((A_RUN + 1))
+            echo ""
+            echo "--- A.$A_RUN: max_num_batched_tokens=$mnb max_num_seqs=$mns ---"
+
+            RUN_START=$(date +%s)
+            start_server "$mnb" "$mns" "${SERVER_LOG_DIR}/server_a_${A_RUN}.log"
+            READY_TIME=$(date +%s)
+            echo "  Startup: $((READY_TIME - RUN_START))s"
+
+            run_bench "$RESULTS_A" "baseline_mnb${mnb}_mns${mns}"
+
+            kill_server
+            RUN_END=$(date +%s)
+            echo "  Total: $((RUN_END - RUN_START))s"
+        done
+    done
+
+    A_END=$(date +%s)
+    A_TOTAL=$((A_END - A_START))
+    A_SUMMARY="${A_TOTAL}s"
+    echo ""
+    echo "Phase A total: ${A_TOTAL}s"
+else
+    A_TOTAL="(skipped)"
+    A_SUMMARY="(skipped)"
+fi
+
+# ──────────────────────────────────────────────
+# Phase B: Reconfigure — single server, N cycles
+# ──────────────────────────────────────────────
+if [[ "${SKIP_RECONFIG:-0}" != "1" ]]; then
+    echo ""
+    echo "###############################################"
+    echo "# Phase B: Reconfigure (1 cold start)"
+    echo "###############################################"
+
+    B_START=$(date +%s)
+
+    # Start with the largest values so CUDA graphs and KV cache cover all
+    # configs. The maximum is computed rather than taken from the last array
+    # element, so the grid does not have to be listed in ascending order.
+    INIT_MNB=$(max_of "${MNB_VALUES[@]}")
+    INIT_MNS=$(max_of "${MNS_VALUES[@]}")
+
+    echo ""
+    echo "--- Starting server (mnb=$INIT_MNB mns=$INIT_MNS) ---"
+    STARTUP_START=$(date +%s)
+    start_server "$INIT_MNB" "$INIT_MNS" "${SERVER_LOG_DIR}/server_b.log"
+    STARTUP_END=$(date +%s)
+    echo "  Startup: $((STARTUP_END - STARTUP_START))s"
+
+    B_RUN=0
+    for mnb in "${MNB_VALUES[@]}"; do
+        for mns in "${MNS_VALUES[@]}"; do
+            B_RUN=$((B_RUN + 1))
+            echo ""
+            echo "--- B.$B_RUN: max_num_batched_tokens=$mnb max_num_seqs=$mns ---"
+
+            RECONF_START=$(date +%s)
+
+            # Set env vars — run_benchmark_serving picks these up via
+            # the VLLM_DYNAMIC_RECONFIGURE hook and calls
+            # reconfigure_vllm_scheduler automatically.
+            export VLLM_DYNAMIC_RECONFIGURE=1
+            export VLLM_MAX_NUM_BATCHED_TOKENS="$mnb"
+            export VLLM_MAX_NUM_SEQS="$mns"
+
+            run_bench "$RESULTS_B" "reconfig_mnb${mnb}_mns${mns}"
+
+            RUN_END=$(date +%s)
+            echo "  Total: $((RUN_END - RECONF_START))s"
+        done
+    done
+
+    kill_server
+
+    B_END=$(date +%s)
+    B_TOTAL=$((B_END - B_START))
+    B_STARTUP=$((STARTUP_END - STARTUP_START))
+    B_SUMMARY="${B_TOTAL}s"
+    echo ""
+    echo "Phase B total: ${B_TOTAL}s (startup: ${B_STARTUP}s)"
+else
+    B_TOTAL="(skipped)"
+    B_SUMMARY="(skipped)"
+fi
+
+# ──────────────────────────────────────────────
+# Summary
+# ──────────────────────────────────────────────
+# The *_SUMMARY strings already carry the unit, so a skipped phase prints
+# "(skipped)" rather than "(skipped)s".
+echo ""
+echo "=============================================="
+echo " Comparison"
+echo "=============================================="
+echo "  Phase A (baseline, ${GRID_SIZE} cold starts): ${A_SUMMARY}"
+echo "  Phase B (reconfigure, 1 cold start):  ${B_SUMMARY}"
+echo ""
+echo "  Results saved to: ${RESULTS_BASE}/"
+echo "=============================================="
diff --git a/docs/vllm-dynamic-scheduler-reconfigure.md
b/docs/vllm-dynamic-scheduler-reconfigure.md new file mode 100644 index 000000000..8d80a3d49 --- /dev/null +++ b/docs/vllm-dynamic-scheduler-reconfigure.md @@ -0,0 +1,109 @@ +# vLLM Dynamic Scheduler Reconfiguration + +InferenceX can optionally reconfigure selected vLLM scheduler limits between +benchmark runs without restarting the serving endpoint. This is useful for +single-server sweeps where the model, parallelism, quantization, max context, +and KV cache layout stay fixed, but each benchmark case uses different scheduler +admission limits. + +## Requirements + +This feature requires a vLLM build that exposes these HTTP endpoints: + +- `POST /pause?mode=keep&clear_cache=true` +- `POST /reconfigure` (JSON body) +- `POST /resume` + +Stock vLLM releases do not provide `/reconfigure`. You need either: + +1. A Docker image built from the vLLM branch containing the reconfigure API + (see [Building the Image From Source](#building-the-image-from-source) below). +2. A runtime patch applied at container start from a mounted vLLM checkout. + +## Reconfigurable Parameters + +| Parameter | Type | Description | +|-----------|------|-------------| +| `max_num_batched_tokens` | int > 0 | Max tokens scheduled per step | +| `max_num_seqs` | int > 0 | Max concurrent sequences | +| `enable_chunked_prefill` | bool | Toggle chunked prefill | +| `long_prefill_token_threshold` | int >= 0 | Cap prefill chunk size (0 = no cap) | + +Everything else (TP, EP, GPU memory, KV cache dtype, block size, CUDA graphs, +compilation config, etc.) is baked in at startup and cannot be changed. + +The `reconfigure_vllm_scheduler` helper sets only `max_num_batched_tokens` and +`max_num_seqs` (via `VLLM_MAX_NUM_BATCHED_TOKENS` and `VLLM_MAX_NUM_SEQS`); it +does not send the other two parameters. + +## Enabling + +Set `VLLM_DYNAMIC_RECONFIGURE=1` and the desired parameter env vars before +calling `run_benchmark_serving` with `--backend vllm`: + +```bash +export VLLM_DYNAMIC_RECONFIGURE=1 +export VLLM_MAX_NUM_BATCHED_TOKENS=32768 +export VLLM_MAX_NUM_SEQS=128 +``` + +`run_benchmark_serving` calls `reconfigure_vllm_scheduler` before each benchmark +run.
The helper pauses vLLM, sends a JSON body to `/reconfigure`, and resumes. + +## A/B Test Script + +`benchmarks/test_reconfigure_sweep.sh` runs back-to-back comparisons: + +- **Phase A (baseline):** N cold starts, one per parameter combo +- **Phase B (reconfigure):** 1 cold start, N reconfigure cycles + +```bash +export MODEL=openai/gpt-oss-120b TP=8 CONC=32 ISL=1024 OSL=1024 +bash benchmarks/test_reconfigure_sweep.sh +``` + +## Pre-built Image + +A pre-built image is available on Docker Hub: + + semianalysiswork/vllm-reconfigure:latest + +This overlays the reconfigure API onto `vllm/vllm-openai:v0.18.0`. +Source: [JordanNanos/vllm `feature/reconfigure-scheduler`](https://github.com/JordanNanos/vllm/tree/feature/reconfigure-scheduler), +Dockerfile: `docker/Dockerfile.single-node-nvidia`. + +## Running a Single-Node Test + +```bash +docker run --rm --init --network host \ + --runtime nvidia --gpus all --ipc host --privileged \ + --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 \ + -v $HF_HUB_CACHE:/root/.cache/huggingface \ + -v $(pwd):/workspace -w /workspace \ + -e HF_TOKEN -e PORT=8888 \ + -e MODEL=openai/gpt-oss-120b \ + -e TP=8 -e CONC=32 \ + -e ISL=1024 -e OSL=1024 \ + semianalysiswork/vllm-reconfigure:latest \ + bash benchmarks/test_reconfigure_sweep.sh +``` + +Use `SKIP_BASELINE=1` or `SKIP_RECONFIG=1` to run only one phase. +Pass extra vLLM flags via `VLLM_EXTRA_ARGS` (e.g. `--kv-cache-dtype fp8`). + +## Building the Image From Source + +To rebuild from the vLLM fork: + +```bash +git clone https://github.com/JordanNanos/vllm.git -b feature/reconfigure-scheduler +cd vllm +docker build --platform linux/amd64 \ + -f docker/Dockerfile.single-node-nvidia \ + --build-arg BASE_IMAGE=vllm/vllm-openai:v0.18.0 \ + -t semianalysiswork/vllm-reconfigure:latest . 
+``` + +## Safety Notes + +Do not use this mechanism to change startup-time engine settings such as model, +quantization, tensor/data/expert parallelism, KV cache dtype, block size, +`gpu_memory_utilization`, or `max_model_len`. Launch vLLM with the largest static +capacity required by the sweep and use dynamic reconfiguration only for scheduler +limits. diff --git a/docs/vllm-patched-distribution.md b/docs/vllm-patched-distribution.md new file mode 100644 index 000000000..ab94f4245 --- /dev/null +++ b/docs/vllm-patched-distribution.md @@ -0,0 +1,94 @@ +# Distributing Patched vLLM for Dynamic Reconfiguration + +Dynamic scheduler reconfiguration requires a vLLM build that includes the +runtime reconfiguration API used by InferenceX: + +- `POST /pause` +- `POST /reconfigure` +- `POST /resume` + +A stock vLLM image or wheel that does not include this API can still run normal +InferenceX benchmarks. The patched build is only needed when +`VLLM_DYNAMIC_RECONFIGURE=1` is enabled. + +## Option 1: Custom Benchmark Container + +Build a container that starts from the normal InferenceX/vLLM benchmark image and +installs the patched vLLM branch or wheel at image build time. This is the most +reproducible option for cluster sweeps. + +Example Dockerfile pattern (replace the `YOUR_*` placeholders): + +```Dockerfile +FROM vllm/vllm-openai:YOUR_TAG + +ARG VLLM_REPO=https://github.com/YOUR_ORG/vllm.git +ARG VLLM_REF=YOUR_COMMIT_SHA + +RUN git clone "$VLLM_REPO" /opt/patched-vllm \ + && cd /opt/patched-vllm \ + && git checkout "$VLLM_REF" \ + && VLLM_USE_PRECOMPILED=1 python3 -m pip install --no-cache-dir -e . + +LABEL org.opencontainers.image.source-vllm="$VLLM_REPO" +LABEL org.opencontainers.image.revision-vllm="$VLLM_REF" +``` + +Then update the InferenceX config entry to use that image for the vLLM run. + +## Option 2: Install a Pinned Wheel Before `vllm serve` + +Build a patched wheel once, publish it to an artifact store, and install it in +the benchmark job before launching `vllm serve`.
+ +```bash +export VLLM_PATCHED_INSTALL_MODE=wheel +export VLLM_PATCHED_WHEEL=/workspace/wheels/vllm-YOUR_VERSION-patched.whl +install_patched_vllm +``` + +Remote wheel URLs also work if the runner has access: + +```bash +export VLLM_PATCHED_WHEEL=https://example.internal/wheels/vllm-patched.whl +install_patched_vllm +``` + +## Option 3: Mount a Patched Checkout and Install Editable + +This is useful for fast experiments, but less reproducible than an image or +wheel. Mount the patched vLLM checkout into the job and run: + +```bash +export VLLM_PATCHED_INSTALL_MODE=editable +export VLLM_PATCHED_CHECKOUT=/workspace/vllm-patched +install_patched_vllm +``` + +## Option 4: Clone and Install a Pinned Git Ref + +This is convenient when the cluster has network access to the patched branch: + +```bash +export VLLM_PATCHED_INSTALL_MODE=git +export VLLM_PATCHED_REPO=https://github.com/YOUR_ORG/vllm.git +export VLLM_PATCHED_REF=YOUR_COMMIT_SHA +install_patched_vllm +``` + +For reproducible benchmark results, prefer a commit SHA over a mutable branch. + +## Recommended Cluster Flow + +1. Start from a custom image or call `install_patched_vllm` before `vllm serve`. +2. Launch vLLM with the largest static capacity needed by the sweep. +3. Enable scheduler reconfiguration for the sweep: + +```bash +export VLLM_DYNAMIC_RECONFIGURE=1 +export VLLM_MAX_NUM_BATCHED_TOKENS=32768 +export VLLM_MAX_NUM_SEQS="$CONC" +``` + +4. Run `run_benchmark_serving` as usual. +5. Record the patched vLLM commit SHA or wheel URL in the run notes/results.