From dd1d58d2b759fbb3cf49868d642b0c9ab4895170 Mon Sep 17 00:00:00 2001 From: Jordan Nanos Date: Tue, 14 Apr 2026 14:38:47 -0700 Subject: [PATCH 1/8] Add vLLM dynamic scheduler reconfigure hook --- benchmarks/benchmark_lib.sh | 35 ++++++++ docs/vllm-dynamic-scheduler-reconfigure.md | 95 ++++++++++++++++++++++ 2 files changed, 130 insertions(+) create mode 100644 docs/vllm-dynamic-scheduler-reconfigure.md diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 535313252..d56da14aa 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -9,6 +9,37 @@ export PYTHONDONTWRITEBYTECODE=1 export PYTHONPYCACHEPREFIX="${PYTHONPYCACHEPREFIX:-/tmp/inferencex-pycache}" mkdir -p "$PYTHONPYCACHEPREFIX" 2>/dev/null || true +# -------------------------------- +# vLLM dynamic scheduler reconfiguration +# -------------------------------- + +# Reconfigure vLLM scheduler limits on a running endpoint. This requires a vLLM +# build that exposes POST /pause, POST /reconfigure_scheduler, and POST /resume. +# The feature is opt-in via VLLM_DYNAMIC_RECONFIGURE=1 and is intended for +# single-server sweeps where model, parallelism, and cache layout stay fixed. 
+reconfigure_vllm_scheduler() { + local port="$1" + local base_url="${VLLM_DYNAMIC_RECONFIGURE_BASE_URL:-http://0.0.0.0:$port}" + local params=() + + [[ -n "${VLLM_MAX_NUM_BATCHED_TOKENS:-}" ]] && \ + params+=(--data-urlencode "max_num_batched_tokens=$VLLM_MAX_NUM_BATCHED_TOKENS") + [[ -n "${VLLM_MAX_NUM_SEQS:-}" ]] && \ + params+=(--data-urlencode "max_num_seqs=$VLLM_MAX_NUM_SEQS") + [[ -n "${VLLM_MAX_NUM_SCHEDULED_TOKENS:-}" ]] && \ + params+=(--data-urlencode "max_num_scheduled_tokens=$VLLM_MAX_NUM_SCHEDULED_TOKENS") + + if [[ ${#params[@]} -eq 0 ]]; then + echo "VLLM_DYNAMIC_RECONFIGURE=1 but no VLLM scheduler parameters were set" + return 1 + fi + + echo "Reconfiguring vLLM scheduler at $base_url" + curl -fsS -X POST "$base_url/pause?mode=keep" + curl -fsS -X POST -G "$base_url/reconfigure_scheduler" "${params[@]}" + curl -fsS -X POST "$base_url/resume" +} + # -------------------------------- # GPU monitoring helpers # -------------------------------- @@ -326,6 +357,10 @@ run_benchmark_serving() { num_prompts="$max_concurrency" fi + if [[ "${VLLM_DYNAMIC_RECONFIGURE:-0}" == "1" && "$backend" == "vllm" ]]; then + reconfigure_vllm_scheduler "$port" + fi + # Build benchmark command local benchmark_cmd=( python3 "$workspace_dir/utils/bench_serving/benchmark_serving.py" diff --git a/docs/vllm-dynamic-scheduler-reconfigure.md b/docs/vllm-dynamic-scheduler-reconfigure.md new file mode 100644 index 000000000..04062037b --- /dev/null +++ b/docs/vllm-dynamic-scheduler-reconfigure.md @@ -0,0 +1,95 @@ +# vLLM Dynamic Scheduler Reconfiguration + +InferenceX can optionally reconfigure selected vLLM scheduler limits between +benchmark runs without restarting the serving endpoint. This is useful for +single-server sweeps where the model, parallelism, quantization, max context, +and KV cache layout stay fixed, but each benchmark case uses different scheduler +admission limits. 
+ +## Requirements + +This feature requires a vLLM build that exposes these HTTP endpoints: + +- `POST /pause?mode=keep` +- `POST /reconfigure_scheduler` +- `POST /resume` + +The stock vLLM releases do not provide `/reconfigure_scheduler` unless the +runtime scheduler reconfiguration patch has been included in the installed vLLM +package or container image. + +## Enabling + +Set `VLLM_DYNAMIC_RECONFIGURE=1` before calling `run_benchmark_serving` with +`--backend vllm`. + +Supported environment variables: + +```bash +export VLLM_DYNAMIC_RECONFIGURE=1 +export VLLM_MAX_NUM_BATCHED_TOKENS=32768 +export VLLM_MAX_NUM_SEQS=128 +export VLLM_MAX_NUM_SCHEDULED_TOKENS=32768 +``` + +`run_benchmark_serving` calls the reconfiguration helper before each benchmark +run. The helper pauses vLLM, applies the requested scheduler limits, and resumes +serving. + +## Example Sweep + +Launch vLLM once with the largest static capacity needed by the sweep, then vary +scheduler limits between benchmark cases: + +```bash +vllm serve "$MODEL" \ + --host 0.0.0.0 \ + --port "$PORT" \ + --tensor-parallel-size "$TP" \ + --max-num-seqs 256 \ + --max-num-batched-tokens 32768 \ + > "$SERVER_LOG" 2>&1 & + +SERVER_PID=$! +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +for conc in 1 2 4 8 16 32 64 128; do + export VLLM_DYNAMIC_RECONFIGURE=1 + export VLLM_MAX_NUM_SEQS="$conc" + export VLLM_MAX_NUM_BATCHED_TOKENS=32768 + export VLLM_MAX_NUM_SCHEDULED_TOKENS=32768 + + run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$((conc * 10))" \ + --max-concurrency "$conc" \ + --result-filename "${RESULT_FILENAME}_conc${conc}" \ + --result-dir /workspace/ \ + --server-pid "$SERVER_PID" +done +``` + +## Distribution of the vLLM Patch + +Cluster runs must use a vLLM package or image that includes the dynamic scheduler +API. 
Practical options are: + +1. Build a custom benchmark container from the vLLM branch that contains the API. +2. Install the patched vLLM wheel in the InferenceX job before starting `vllm serve`. +3. Mount a patched vLLM checkout and install it editable in the benchmark image. + +For reproducible cluster results, prefer a custom container or pinned wheel and +record the vLLM commit SHA in the benchmark metadata. + +## Safety Notes + +Do not use this mechanism to change startup-time engine settings such as model, +quantization, tensor/data/expert parallelism, KV cache dtype, block size, +`gpu_memory_utilization`, or `max_model_len`. Launch vLLM with the largest static +capacity required by the sweep and use dynamic reconfiguration only for scheduler +limits. From 51f3290d7cfe3516e59c58dbc3db858d7b6b8f28 Mon Sep 17 00:00:00 2001 From: Jordan Nanos Date: Tue, 14 Apr 2026 14:40:13 -0700 Subject: [PATCH 2/8] Fix reconfigure helper to match vLLM /reconfigure API and add A/B test script - Fix reconfigure_vllm_scheduler() to use POST /reconfigure with a JSON body instead of query params to the non-existent /reconfigure_scheduler - Remove max_num_scheduled_tokens (internal name, not exposed by API) - Use mode=abort&clear_cache=true on /pause for clean reconfigure cycles - Add benchmarks/test_reconfigure_sweep.sh for standalone A/B testing on a cluster: runs N cold starts (baseline) vs 1 start + N reconfigure cycles and prints wall-clock comparison - Update docs to match actual API surface Co-Authored-By: Claude Opus 4.6 (1M context) --- benchmarks/benchmark_lib.sh | 35 ++-- benchmarks/test_reconfigure_sweep.sh | 205 +++++++++++++++++++++ docs/vllm-dynamic-scheduler-reconfigure.md | 108 +++++------ 3 files changed, 282 insertions(+), 66 deletions(-) create mode 100755 benchmarks/test_reconfigure_sweep.sh diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index d56da14aa..5754b87f1 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh 
@@ -14,29 +14,40 @@ mkdir -p "$PYTHONPYCACHEPREFIX" 2>/dev/null || true # -------------------------------- # Reconfigure vLLM scheduler limits on a running endpoint. This requires a vLLM -# build that exposes POST /pause, POST /reconfigure_scheduler, and POST /resume. +# build that exposes POST /pause, POST /reconfigure, and POST /resume. # The feature is opt-in via VLLM_DYNAMIC_RECONFIGURE=1 and is intended for # single-server sweeps where model, parallelism, and cache layout stay fixed. +# +# Supported env vars (set before calling): +# VLLM_MAX_NUM_BATCHED_TOKENS -- max tokens scheduled per step +# VLLM_MAX_NUM_SEQS -- max concurrent sequences reconfigure_vllm_scheduler() { local port="$1" local base_url="${VLLM_DYNAMIC_RECONFIGURE_BASE_URL:-http://0.0.0.0:$port}" - local params=() - [[ -n "${VLLM_MAX_NUM_BATCHED_TOKENS:-}" ]] && \ - params+=(--data-urlencode "max_num_batched_tokens=$VLLM_MAX_NUM_BATCHED_TOKENS") - [[ -n "${VLLM_MAX_NUM_SEQS:-}" ]] && \ - params+=(--data-urlencode "max_num_seqs=$VLLM_MAX_NUM_SEQS") - [[ -n "${VLLM_MAX_NUM_SCHEDULED_TOKENS:-}" ]] && \ - params+=(--data-urlencode "max_num_scheduled_tokens=$VLLM_MAX_NUM_SCHEDULED_TOKENS") + # Build JSON body from set env vars + local json="{" + local sep="" + if [[ -n "${VLLM_MAX_NUM_BATCHED_TOKENS:-}" ]]; then + json+="${sep}\"max_num_batched_tokens\":${VLLM_MAX_NUM_BATCHED_TOKENS}" + sep="," + fi + if [[ -n "${VLLM_MAX_NUM_SEQS:-}" ]]; then + json+="${sep}\"max_num_seqs\":${VLLM_MAX_NUM_SEQS}" + sep="," + fi + json+="}" - if [[ ${#params[@]} -eq 0 ]]; then + if [[ "$json" == "{}" ]]; then echo "VLLM_DYNAMIC_RECONFIGURE=1 but no VLLM scheduler parameters were set" return 1 fi - echo "Reconfiguring vLLM scheduler at $base_url" - curl -fsS -X POST "$base_url/pause?mode=keep" - curl -fsS -X POST -G "$base_url/reconfigure_scheduler" "${params[@]}" + echo "Reconfiguring vLLM scheduler at $base_url: $json" + curl -fsS -X POST "$base_url/pause?mode=abort&clear_cache=true" + curl -fsS -X POST 
"$base_url/reconfigure" \ + -H "Content-Type: application/json" \ + -d "$json" curl -fsS -X POST "$base_url/resume" } diff --git a/benchmarks/test_reconfigure_sweep.sh b/benchmarks/test_reconfigure_sweep.sh new file mode 100755 index 000000000..815db2355 --- /dev/null +++ b/benchmarks/test_reconfigure_sweep.sh @@ -0,0 +1,205 @@ +#!/usr/bin/env bash +# A/B test: compare N cold starts (baseline) vs 1 start + N reconfigure cycles. +# +# Run this directly on a GPU node inside a vLLM container that includes the +# /reconfigure endpoint. Mount the inferencex workspace at /workspace. +# +# Required env vars: +# MODEL -- HuggingFace model path (e.g. openai/gpt-oss-120b) +# TP -- tensor-parallel size (e.g. 8) +# CONC -- benchmark concurrency (e.g. 32) +# ISL -- input sequence length (e.g. 1024) +# OSL -- output sequence length (e.g. 1024) +# +# Optional: +# PORT -- server port (default 8888) +# VLLM_EXTRA_ARGS -- extra args passed to vllm serve (e.g. "--kv-cache-dtype fp8") +# SKIP_BASELINE -- set to 1 to skip the baseline phase +# SKIP_RECONFIG -- set to 1 to skip the reconfigure phase +# +# Usage: +# export MODEL=openai/gpt-oss-120b TP=8 CONC=32 ISL=1024 OSL=1024 +# bash benchmarks/test_reconfigure_sweep.sh +set -euo pipefail + +source "$(dirname "$0")/benchmark_lib.sh" + +check_env_vars MODEL TP CONC ISL OSL + +PORT=${PORT:-8888} +MAX_MODEL_LEN=$(( ISL + OSL + 256 )) +NUM_PROMPTS=$(( CONC * 10 )) + +# Parameter grid to sweep +MNB_VALUES=(4096 8192 16384) +MNS_VALUES=(256 512) +GRID_SIZE=$(( ${#MNB_VALUES[@]} * ${#MNS_VALUES[@]} )) + +RESULTS_BASE=/workspace/results_reconfigure_test +RESULTS_A="${RESULTS_BASE}/baseline" +RESULTS_B="${RESULTS_BASE}/reconfig" +mkdir -p "$RESULTS_A" "$RESULTS_B" + +SERVER_LOG_DIR="${RESULTS_BASE}/logs" +mkdir -p "$SERVER_LOG_DIR" + +start_server() { + local mnb="$1" mns="$2" log="$3" + + vllm serve "$MODEL" \ + --host 0.0.0.0 --port "$PORT" \ + --tensor-parallel-size "$TP" \ + --gpu-memory-utilization 0.9 \ + --max-model-len "$MAX_MODEL_LEN" 
\ + --max-num-batched-tokens "$mnb" \ + --max-num-seqs "$mns" \ + --no-enable-prefix-caching \ + --disable-log-requests \ + ${VLLM_EXTRA_ARGS:-} \ + > "$log" 2>&1 & + SERVER_PID=$! + + wait_for_server_ready \ + --port "$PORT" \ + --server-log "$log" \ + --server-pid "$SERVER_PID" +} + +kill_server() { + if [[ -n "${SERVER_PID:-}" ]]; then + kill "$SERVER_PID" 2>/dev/null || true + wait "$SERVER_PID" 2>/dev/null || true + unset SERVER_PID + sleep 2 + fi +} + +run_bench() { + local result_dir="$1" result_name="$2" + + run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio 0.8 \ + --num-prompts "$NUM_PROMPTS" \ + --max-concurrency "$CONC" \ + --result-filename "$result_name" \ + --result-dir "$result_dir" \ + --server-pid "$SERVER_PID" +} + +trap kill_server EXIT + +pip install -q datasets pandas 2>/dev/null || true + +# ────────────────────────────────────────────── +# Phase A: Baseline — separate server per config +# ────────────────────────────────────────────── +if [[ "${SKIP_BASELINE:-0}" != "1" ]]; then + echo "" + echo "###############################################" + echo "# Phase A: Baseline (${GRID_SIZE} cold starts)" + echo "###############################################" + + A_START=$(date +%s) + A_RUN=0 + + for mnb in "${MNB_VALUES[@]}"; do + for mns in "${MNS_VALUES[@]}"; do + A_RUN=$((A_RUN + 1)) + echo "" + echo "--- A.$A_RUN: max_num_batched_tokens=$mnb max_num_seqs=$mns ---" + + RUN_START=$(date +%s) + start_server "$mnb" "$mns" "${SERVER_LOG_DIR}/server_a_${A_RUN}.log" + READY_TIME=$(date +%s) + echo " Startup: $((READY_TIME - RUN_START))s" + + run_bench "$RESULTS_A" "baseline_mnb${mnb}_mns${mns}" + + kill_server + RUN_END=$(date +%s) + echo " Total: $((RUN_END - RUN_START))s" + done + done + + A_END=$(date +%s) + A_TOTAL=$((A_END - A_START)) + echo "" + echo "Phase A total: ${A_TOTAL}s" +else + A_TOTAL="(skipped)" +fi + +# 
────────────────────────────────────────────── +# Phase B: Reconfigure — single server, N cycles +# ────────────────────────────────────────────── +if [[ "${SKIP_RECONFIG:-0}" != "1" ]]; then + echo "" + echo "###############################################" + echo "# Phase B: Reconfigure (1 cold start)" + echo "###############################################" + + B_START=$(date +%s) + + # Start with the largest values so CUDA graphs and KV cache cover all configs + INIT_MNB=${MNB_VALUES[-1]} + INIT_MNS=${MNS_VALUES[-1]} + + echo "" + echo "--- Starting server (mnb=$INIT_MNB mns=$INIT_MNS) ---" + STARTUP_START=$(date +%s) + start_server "$INIT_MNB" "$INIT_MNS" "${SERVER_LOG_DIR}/server_b.log" + STARTUP_END=$(date +%s) + echo " Startup: $((STARTUP_END - STARTUP_START))s" + + B_RUN=0 + for mnb in "${MNB_VALUES[@]}"; do + for mns in "${MNS_VALUES[@]}"; do + B_RUN=$((B_RUN + 1)) + echo "" + echo "--- B.$B_RUN: max_num_batched_tokens=$mnb max_num_seqs=$mns ---" + + RECONF_START=$(date +%s) + + export VLLM_DYNAMIC_RECONFIGURE=1 + export VLLM_MAX_NUM_BATCHED_TOKENS="$mnb" + export VLLM_MAX_NUM_SEQS="$mns" + reconfigure_vllm_scheduler "$PORT" + + RECONF_END=$(date +%s) + echo " Reconfigure: $((RECONF_END - RECONF_START))s" + + run_bench "$RESULTS_B" "reconfig_mnb${mnb}_mns${mns}" + + RUN_END=$(date +%s) + echo " Total: $((RUN_END - RECONF_START))s" + done + done + + kill_server + + B_END=$(date +%s) + B_TOTAL=$((B_END - B_START)) + B_STARTUP=$((STARTUP_END - STARTUP_START)) + echo "" + echo "Phase B total: ${B_TOTAL}s (startup: ${B_STARTUP}s)" +else + B_TOTAL="(skipped)" +fi + +# ────────────────────────────────────────────── +# Summary +# ────────────────────────────────────────────── +echo "" +echo "==============================================" +echo " Comparison" +echo "==============================================" +echo " Phase A (baseline, ${GRID_SIZE} cold starts): ${A_TOTAL}s" +echo " Phase B (reconfigure, 1 cold start): ${B_TOTAL}s" +echo "" +echo " Results 
saved to: ${RESULTS_BASE}/" +echo "==============================================" diff --git a/docs/vllm-dynamic-scheduler-reconfigure.md b/docs/vllm-dynamic-scheduler-reconfigure.md index 04062037b..7db5c7b07 100644 --- a/docs/vllm-dynamic-scheduler-reconfigure.md +++ b/docs/vllm-dynamic-scheduler-reconfigure.md @@ -10,81 +10,81 @@ admission limits. This feature requires a vLLM build that exposes these HTTP endpoints: -- `POST /pause?mode=keep` -- `POST /reconfigure_scheduler` +- `POST /pause?mode=abort&clear_cache=true` +- `POST /reconfigure` (JSON body) - `POST /resume` -The stock vLLM releases do not provide `/reconfigure_scheduler` unless the -runtime scheduler reconfiguration patch has been included in the installed vLLM -package or container image. +The stock vLLM releases do not provide `/reconfigure`. You need either: -## Enabling +1. A Docker image built from the vLLM branch containing the reconfigure API + (see [Building the patched image](#building-the-patched-image) below). +2. A runtime patch applied at container start from a mounted vLLM checkout. + +## Reconfigurable Parameters + +| Parameter | Type | Description | +|-----------|------|-------------| +| `max_num_batched_tokens` | int > 0 | Max tokens scheduled per step | +| `max_num_seqs` | int > 0 | Max concurrent sequences | +| `enable_chunked_prefill` | bool | Toggle chunked prefill | +| `long_prefill_token_threshold` | int >= 0 | Cap prefill chunk size (0 = no cap) | -Set `VLLM_DYNAMIC_RECONFIGURE=1` before calling `run_benchmark_serving` with -`--backend vllm`. +Everything else (TP, EP, GPU memory, KV cache dtype, block size, CUDA graphs, +compilation config, etc.) is baked in at startup and cannot be changed. 
-Supported environment variables: +## Enabling + +Set `VLLM_DYNAMIC_RECONFIGURE=1` and the desired parameter env vars before +calling `run_benchmark_serving` with `--backend vllm`: ```bash export VLLM_DYNAMIC_RECONFIGURE=1 export VLLM_MAX_NUM_BATCHED_TOKENS=32768 export VLLM_MAX_NUM_SEQS=128 -export VLLM_MAX_NUM_SCHEDULED_TOKENS=32768 ``` -`run_benchmark_serving` calls the reconfiguration helper before each benchmark -run. The helper pauses vLLM, applies the requested scheduler limits, and resumes -serving. +`run_benchmark_serving` calls `reconfigure_vllm_scheduler` before each benchmark +run. The helper pauses vLLM, sends a JSON body to `/reconfigure`, and resumes. + +## A/B Test Script -## Example Sweep +`benchmarks/test_reconfigure_sweep.sh` runs back-to-back comparisons: -Launch vLLM once with the largest static capacity needed by the sweep, then vary -scheduler limits between benchmark cases: +- **Phase A (baseline):** N cold starts, one per parameter combo +- **Phase B (reconfigure):** 1 cold start, N reconfigure cycles ```bash -vllm serve "$MODEL" \ - --host 0.0.0.0 \ - --port "$PORT" \ - --tensor-parallel-size "$TP" \ - --max-num-seqs 256 \ - --max-num-batched-tokens 32768 \ - > "$SERVER_LOG" 2>&1 & - -SERVER_PID=$! 
-wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" - -for conc in 1 2 4 8 16 32 64 128; do - export VLLM_DYNAMIC_RECONFIGURE=1 - export VLLM_MAX_NUM_SEQS="$conc" - export VLLM_MAX_NUM_BATCHED_TOKENS=32768 - export VLLM_MAX_NUM_SCHEDULED_TOKENS=32768 - - run_benchmark_serving \ - --model "$MODEL" \ - --port "$PORT" \ - --backend vllm \ - --input-len "$ISL" \ - --output-len "$OSL" \ - --random-range-ratio "$RANDOM_RANGE_RATIO" \ - --num-prompts "$((conc * 10))" \ - --max-concurrency "$conc" \ - --result-filename "${RESULT_FILENAME}_conc${conc}" \ - --result-dir /workspace/ \ - --server-pid "$SERVER_PID" -done +export MODEL=openai/gpt-oss-120b TP=8 CONC=32 ISL=1024 OSL=1024 +bash benchmarks/test_reconfigure_sweep.sh ``` -## Distribution of the vLLM Patch +## Building the Patched Image -Cluster runs must use a vLLM package or image that includes the dynamic scheduler -API. Practical options are: +The changes are pure Python -- no C++/CUDA recompilation needed. Overlay them +onto the stock vLLM image: -1. Build a custom benchmark container from the vLLM branch that contains the API. -2. Install the patched vLLM wheel in the InferenceX job before starting `vllm serve`. -3. Mount a patched vLLM checkout and install it editable in the benchmark image. +```bash +# From the vllm repo root (on the branch with the reconfigure patch) +docker build -f docker/Dockerfile.reconfigure-overlay \ + -t ghcr.io/semianalysisai/vllm-reconfigure:test1 . + +docker push ghcr.io/semianalysisai/vllm-reconfigure:test1 +``` -For reproducible cluster results, prefer a custom container or pinned wheel and -record the vLLM commit SHA in the benchmark metadata. 
+Or patch at runtime by mounting the vLLM checkout and running the overlay script +at the top of the benchmark: + +```bash +docker run --gpus all --rm -it --network host --shm-size 64g \ + -v /path/to/vllm:/workspace/vllm-patch:ro \ + -v /path/to/inferencex:/workspace \ + vllm/vllm-openai:v0.15.1 \ + bash -c ' + bash /workspace/vllm-patch/docker/apply-reconfigure-overlay.sh + export MODEL=openai/gpt-oss-120b TP=8 CONC=32 ISL=1024 OSL=1024 + bash /workspace/benchmarks/test_reconfigure_sweep.sh + ' +``` ## Safety Notes From a1952975b54bc9a7121be3bc62d5f3eeef0cdae8 Mon Sep 17 00:00:00 2001 From: Jordan Nanos Date: Tue, 14 Apr 2026 14:41:15 -0700 Subject: [PATCH 3/8] Document patched vLLM distribution options --- benchmarks/benchmark_lib.sh | 78 +++++++++++++++++++++++++ docs/vllm-patched-distribution.md | 94 +++++++++++++++++++++++++++++++ 2 files changed, 172 insertions(+) create mode 100644 docs/vllm-patched-distribution.md diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 5754b87f1..e138a93eb 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -851,3 +851,81 @@ run_eval() { fi return $eval_rc } + +# -------------------------------- +# Patched vLLM distribution helpers +# -------------------------------- + +# Install a patched vLLM build before launching `vllm serve`. This is optional +# and only needed when the active container/image does not already include the +# dynamic scheduler reconfiguration API. 
+# +# Supported env vars: +# VLLM_PATCHED_WHEEL -- local or remote wheel path/URL +# VLLM_PATCHED_REPO -- git repository URL for patched vLLM +# VLLM_PATCHED_REF -- branch, tag, or commit for VLLM_PATCHED_REPO +# VLLM_PATCHED_CHECKOUT -- existing mounted checkout to install editable +# VLLM_PATCHED_INSTALL_MODE -- wheel, git, editable, or auto (default) +install_patched_vllm() { + local mode="${VLLM_PATCHED_INSTALL_MODE:-auto}" + + if [[ "$mode" == "auto" ]]; then + if [[ -n "${VLLM_PATCHED_WHEEL:-}" ]]; then + mode="wheel" + elif [[ -n "${VLLM_PATCHED_CHECKOUT:-}" ]]; then + mode="editable" + elif [[ -n "${VLLM_PATCHED_REPO:-}" ]]; then + mode="git" + else + echo "No patched vLLM install source configured; using existing vLLM" + return 0 + fi + fi + + case "$mode" in + wheel) + if [[ -z "${VLLM_PATCHED_WHEEL:-}" ]]; then + echo "VLLM_PATCHED_INSTALL_MODE=wheel requires VLLM_PATCHED_WHEEL" + return 1 + fi + echo "Installing patched vLLM wheel: $VLLM_PATCHED_WHEEL" + python3 -m pip install --no-cache-dir --force-reinstall "$VLLM_PATCHED_WHEEL" + ;; + git) + if [[ -z "${VLLM_PATCHED_REPO:-}" || -z "${VLLM_PATCHED_REF:-}" ]]; then + echo "VLLM_PATCHED_INSTALL_MODE=git requires VLLM_PATCHED_REPO and VLLM_PATCHED_REF" + return 1 + fi + local checkout_dir="${VLLM_PATCHED_GIT_DIR:-/tmp/patched-vllm}" + rm -rf "$checkout_dir" + git clone --depth 1 --branch "$VLLM_PATCHED_REF" "$VLLM_PATCHED_REPO" "$checkout_dir" || { + git clone "$VLLM_PATCHED_REPO" "$checkout_dir" + git -C "$checkout_dir" checkout "$VLLM_PATCHED_REF" + } + echo "Installing patched vLLM from $VLLM_PATCHED_REPO@$VLLM_PATCHED_REF" + VLLM_USE_PRECOMPILED=${VLLM_USE_PRECOMPILED:-1} \ + python3 -m pip install --no-cache-dir -e "$checkout_dir" + ;; + editable) + if [[ -z "${VLLM_PATCHED_CHECKOUT:-}" ]]; then + echo "VLLM_PATCHED_INSTALL_MODE=editable requires VLLM_PATCHED_CHECKOUT" + return 1 + fi + echo "Installing patched vLLM editable checkout: $VLLM_PATCHED_CHECKOUT" + 
VLLM_USE_PRECOMPILED=${VLLM_USE_PRECOMPILED:-1} \ + python3 -m pip install --no-cache-dir -e "$VLLM_PATCHED_CHECKOUT" + ;; + *) + echo "Unknown VLLM_PATCHED_INSTALL_MODE: $mode" + return 1 + ;; + esac + + python3 - <<'PY' +import importlib.metadata +try: + print("Installed vLLM version:", importlib.metadata.version("vllm")) +except importlib.metadata.PackageNotFoundError: + print("Installed vLLM version: unknown") +PY +} diff --git a/docs/vllm-patched-distribution.md b/docs/vllm-patched-distribution.md new file mode 100644 index 000000000..ab94f4245 --- /dev/null +++ b/docs/vllm-patched-distribution.md @@ -0,0 +1,94 @@ +# Distributing Patched vLLM for Dynamic Reconfiguration + +Dynamic scheduler reconfiguration requires a vLLM build that includes the +runtime reconfiguration API used by InferenceX: + +- `POST /pause` +- `POST /reconfigure` +- `POST /resume` + +A stock vLLM image or wheel that does not include this API can still run normal +InferenceX benchmarks. The patched build is needed only when +`VLLM_DYNAMIC_RECONFIGURE=1` is enabled. + +## Option 1: Custom Benchmark Container + +Build a container that starts from the normal InferenceX/vLLM benchmark image and +installs the patched vLLM branch or wheel at image build time. This is the most +reproducible option for cluster sweeps. + +Example Dockerfile pattern: + +```Dockerfile +FROM vllm/vllm-openai:<tag> + +ARG VLLM_REPO=https://github.com/<org>/vllm.git +ARG VLLM_REF=<commit-sha> + +RUN git clone "$VLLM_REPO" /opt/patched-vllm \ + && cd /opt/patched-vllm \ + && git checkout "$VLLM_REF" \ + && VLLM_USE_PRECOMPILED=1 python3 -m pip install --no-cache-dir -e . + +LABEL org.opencontainers.image.source-vllm="$VLLM_REPO" +LABEL org.opencontainers.image.revision-vllm="$VLLM_REF" +``` + +Then update the InferenceX config entry to use that image for the vLLM run. 
+ +## Option 2: Install a Pinned Wheel Before `vllm serve` + +Build a patched wheel once, publish it to an artifact store, and install it in +the benchmark job before launching `vllm serve`. + +```bash +export VLLM_PATCHED_INSTALL_MODE=wheel +export VLLM_PATCHED_WHEEL=/workspace/wheels/vllm-<version>-patched.whl +install_patched_vllm +``` + +Remote wheel URLs also work if the runner has access: + +```bash +export VLLM_PATCHED_WHEEL=https://example.internal/wheels/vllm-patched.whl +install_patched_vllm +``` + +## Option 3: Mount a Patched Checkout and Install Editable + +This is useful for fast experiments, but less reproducible than an image or +wheel. Mount the patched vLLM checkout into the job and run: + +```bash +export VLLM_PATCHED_INSTALL_MODE=editable +export VLLM_PATCHED_CHECKOUT=/workspace/vllm-patched +install_patched_vllm +``` + +## Option 4: Clone and Install a Pinned Git Ref + +This is convenient when the cluster has network access to the patched branch: + +```bash +export VLLM_PATCHED_INSTALL_MODE=git +export VLLM_PATCHED_REPO=https://github.com/<org>/vllm.git +export VLLM_PATCHED_REF=<commit-sha> +install_patched_vllm +``` + +For reproducible benchmark results, prefer a commit SHA over a mutable branch. + +## Recommended Cluster Flow + +1. Start from a custom image or call `install_patched_vllm` before `vllm serve`. +2. Launch vLLM with the largest static capacity needed by the sweep. +3. Enable scheduler reconfiguration for the sweep: + +```bash +export VLLM_DYNAMIC_RECONFIGURE=1 +export VLLM_MAX_NUM_BATCHED_TOKENS=32768 +export VLLM_MAX_NUM_SEQS="$CONC" +``` + +4. Run `run_benchmark_serving` as usual. +5. Record the patched vLLM commit SHA or wheel URL in the run notes/results. From d35f6c331431b618d4cf7b69157d09d8b4949736 Mon Sep 17 00:00:00 2001 From: Jordan Nanos Date: Tue, 14 Apr 2026 14:44:18 -0700 Subject: [PATCH 4/8] Fix pause mode: use mode=keep (required by reconfigure API) The vLLM /reconfigure endpoint requires PAUSED_ALL state, which maps to pause mode="keep". 
Using mode="abort" would leave the scheduler in PAUSED_NEW state, causing reconfigure to reject the request. Co-Authored-By: Claude Opus 4.6 (1M context) --- benchmarks/benchmark_lib.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index e138a93eb..ba9c39ae9 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -44,7 +44,7 @@ reconfigure_vllm_scheduler() { fi echo "Reconfiguring vLLM scheduler at $base_url: $json" - curl -fsS -X POST "$base_url/pause?mode=abort&clear_cache=true" + curl -fsS -X POST "$base_url/pause?mode=keep&clear_cache=true" curl -fsS -X POST "$base_url/reconfigure" \ -H "Content-Type: application/json" \ -d "$json" From f9aa78c4fd4fb7c9eaa138eee7d260d299532474 Mon Sep 17 00:00:00 2001 From: Jordan Nanos Date: Tue, 14 Apr 2026 14:45:58 -0700 Subject: [PATCH 5/8] Fix four bugs in reconfigure integration 1. Double reconfigure in test_reconfigure_sweep.sh: Phase B called reconfigure_vllm_scheduler manually then run_benchmark_serving called it again via the VLLM_DYNAMIC_RECONFIGURE hook. Remove the manual call and let the hook handle it. 2. Doc listed mode=abort but vLLM /reconfigure requires PAUSED_ALL which maps to mode=keep. Fix the Requirements section. 3. No error recovery in reconfigure_vllm_scheduler: if /reconfigure failed, curl exited non-zero, set -e killed the function, and the server stayed paused forever. Now capture the exit code, always call /resume, then propagate the error. 4. --force-reinstall in wheel mode reinstalls all dependencies. Use --no-deps --force-reinstall to only replace the vllm package. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- benchmarks/benchmark_lib.sh | 13 +++++++++++-- benchmarks/test_reconfigure_sweep.sh | 7 +++---- docs/vllm-dynamic-scheduler-reconfigure.md | 2 +- 3 files changed, 15 insertions(+), 7 deletions(-) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index ba9c39ae9..608261a06 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -45,10 +45,19 @@ reconfigure_vllm_scheduler() { echo "Reconfiguring vLLM scheduler at $base_url: $json" curl -fsS -X POST "$base_url/pause?mode=keep&clear_cache=true" + + local rc=0 curl -fsS -X POST "$base_url/reconfigure" \ -H "Content-Type: application/json" \ - -d "$json" + -d "$json" || rc=$? + + # Always resume so the server is never left paused on failure. curl -fsS -X POST "$base_url/resume" + + if [[ "$rc" -ne 0 ]]; then + echo "ERROR: /reconfigure failed (curl exit code $rc)" >&2 + return "$rc" + fi } # -------------------------------- @@ -889,7 +898,7 @@ install_patched_vllm() { return 1 fi echo "Installing patched vLLM wheel: $VLLM_PATCHED_WHEEL" - python3 -m pip install --no-cache-dir --force-reinstall "$VLLM_PATCHED_WHEEL" + python3 -m pip install --no-cache-dir --no-deps --force-reinstall "$VLLM_PATCHED_WHEEL" ;; git) if [[ -z "${VLLM_PATCHED_REPO:-}" || -z "${VLLM_PATCHED_REF:-}" ]]; then diff --git a/benchmarks/test_reconfigure_sweep.sh b/benchmarks/test_reconfigure_sweep.sh index 815db2355..3a7d5f08d 100755 --- a/benchmarks/test_reconfigure_sweep.sh +++ b/benchmarks/test_reconfigure_sweep.sh @@ -165,13 +165,12 @@ if [[ "${SKIP_RECONFIG:-0}" != "1" ]]; then RECONF_START=$(date +%s) + # Set env vars — run_benchmark_serving picks these up via + # the VLLM_DYNAMIC_RECONFIGURE hook and calls + # reconfigure_vllm_scheduler automatically. 
export VLLM_DYNAMIC_RECONFIGURE=1 export VLLM_MAX_NUM_BATCHED_TOKENS="$mnb" export VLLM_MAX_NUM_SEQS="$mns" - reconfigure_vllm_scheduler "$PORT" - - RECONF_END=$(date +%s) - echo " Reconfigure: $((RECONF_END - RECONF_START))s" run_bench "$RESULTS_B" "reconfig_mnb${mnb}_mns${mns}" diff --git a/docs/vllm-dynamic-scheduler-reconfigure.md b/docs/vllm-dynamic-scheduler-reconfigure.md index 7db5c7b07..75dbd90bd 100644 --- a/docs/vllm-dynamic-scheduler-reconfigure.md +++ b/docs/vllm-dynamic-scheduler-reconfigure.md @@ -10,7 +10,7 @@ admission limits. This feature requires a vLLM build that exposes these HTTP endpoints: -- `POST /pause?mode=abort&clear_cache=true` +- `POST /pause?mode=keep&clear_cache=true` - `POST /reconfigure` (JSON body) - `POST /resume` From 7c5859db83b02a687ab98eb86dc4b1dfa65e14df Mon Sep 17 00:00:00 2001 From: Jordan Nanos Date: Tue, 14 Apr 2026 15:08:15 -0700 Subject: [PATCH 6/8] Update docs with pre-built Docker Hub image and run instructions Image: semianalysiswork/vllm-reconfigure:latest Based on vllm/vllm-openai:v0.18.0 with reconfigure API overlay. Co-Authored-By: Claude Opus 4.6 (1M context) --- docs/vllm-dynamic-scheduler-reconfigure.md | 52 ++++++++++++++-------- 1 file changed, 33 insertions(+), 19 deletions(-) diff --git a/docs/vllm-dynamic-scheduler-reconfigure.md b/docs/vllm-dynamic-scheduler-reconfigure.md index 75dbd90bd..8d80a3d49 100644 --- a/docs/vllm-dynamic-scheduler-reconfigure.md +++ b/docs/vllm-dynamic-scheduler-reconfigure.md @@ -58,32 +58,46 @@ export MODEL=openai/gpt-oss-120b TP=8 CONC=32 ISL=1024 OSL=1024 bash benchmarks/test_reconfigure_sweep.sh ``` -## Building the Patched Image +## Pre-built Image -The changes are pure Python -- no C++/CUDA recompilation needed. 
Overlay them -onto the stock vLLM image: +A pre-built image is available on Docker Hub: -```bash -# From the vllm repo root (on the branch with the reconfigure patch) -docker build -f docker/Dockerfile.reconfigure-overlay \ - -t ghcr.io/semianalysisai/vllm-reconfigure:test1 . + semianalysiswork/vllm-reconfigure:latest + +This overlays the reconfigure API onto `vllm/vllm-openai:v0.18.0`. +Source: [JordanNanos/vllm `feature/reconfigure-scheduler`](https://github.com/JordanNanos/vllm/tree/feature/reconfigure-scheduler), +Dockerfile: `docker/Dockerfile.single-node-nvidia`. + +## Running a Single-Node Test -docker push ghcr.io/semianalysisai/vllm-reconfigure:test1 +```bash +docker run --rm --init --network host \ + --runtime nvidia --gpus all --ipc host --privileged \ + --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 \ + -v "${HF_HUB_CACHE}:/root/.cache/huggingface" \ + -v "$(pwd):/workspace" -w /workspace \ + -e HF_TOKEN -e PORT=8888 \ + -e MODEL=openai/gpt-oss-120b \ + -e TP=8 -e CONC=32 \ + -e ISL=1024 -e OSL=1024 \ + semianalysiswork/vllm-reconfigure:latest \ + bash benchmarks/test_reconfigure_sweep.sh ``` -Or patch at runtime by mounting the vLLM checkout and running the overlay script -at the top of the benchmark: +Use `SKIP_BASELINE=1` or `SKIP_RECONFIG=1` to run only one phase. +Pass extra vLLM flags via `VLLM_EXTRA_ARGS` (e.g. `--kv-cache-dtype fp8`).
+ +## Building the Image From Source + +To rebuild from the vLLM fork: ```bash -docker run --gpus all --rm -it --network host --shm-size 64g \ - -v /path/to/vllm:/workspace/vllm-patch:ro \ - -v /path/to/inferencex:/workspace \ - vllm/vllm-openai:v0.15.1 \ - bash -c ' - bash /workspace/vllm-patch/docker/apply-reconfigure-overlay.sh - export MODEL=openai/gpt-oss-120b TP=8 CONC=32 ISL=1024 OSL=1024 - bash /workspace/benchmarks/test_reconfigure_sweep.sh - ' +git clone https://github.com/JordanNanos/vllm.git -b feature/reconfigure-scheduler +cd vllm +docker build --platform linux/amd64 \ + -f docker/Dockerfile.single-node-nvidia \ + --build-arg BASE_IMAGE=vllm/vllm-openai:v0.18.0 \ + -t semianalysiswork/vllm-reconfigure:latest . ``` ## Safety Notes From c0859c175de6a5be6f25e999f1a1effe4acdf78e Mon Sep 17 00:00:00 2001 From: Jordan Nanos Date: Tue, 14 Apr 2026 17:54:10 -0700 Subject: [PATCH 7/8] Add workflow for scheduler reconfigure A/B test Standalone workflow_dispatch workflow that runs benchmarks/test_reconfigure_sweep.sh on any GPU runner using the semianalysiswork/vllm-reconfigure:latest image. Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/workflows/test-reconfigure.yml | 116 +++++++++++++++++++++++++ 1 file changed, 116 insertions(+) create mode 100644 .github/workflows/test-reconfigure.yml diff --git a/.github/workflows/test-reconfigure.yml b/.github/workflows/test-reconfigure.yml new file mode 100644 index 000000000..2d63b0ff8 --- /dev/null +++ b/.github/workflows/test-reconfigure.yml @@ -0,0 +1,116 @@ +name: Test - Scheduler Reconfigure A/B +run-name: "Reconfigure A/B | ${{ inputs.model }} tp=${{ inputs.tp }} conc=${{ inputs.conc }} | ${{ inputs.runner }}" + +on: + workflow_dispatch: + inputs: + runner: + description: "Self-hosted runner label (e.g. 
b200-dgxc_1)" + required: true + type: string + model: + description: "HuggingFace model path" + required: true + type: string + default: "openai/gpt-oss-120b" + tp: + description: "Tensor parallel size" + required: true + type: string + default: "8" + conc: + description: "Benchmark concurrency" + required: true + type: string + default: "32" + isl: + description: "Input sequence length" + required: true + type: string + default: "1024" + osl: + description: "Output sequence length" + required: true + type: string + default: "1024" + extra-args: + description: "Extra vllm serve args (e.g. --kv-cache-dtype fp8)" + required: false + type: string + skip-baseline: + description: "Skip Phase A (baseline cold starts)" + required: false + type: boolean + default: false + skip-reconfig: + description: "Skip Phase B (reconfigure)" + required: false + type: boolean + default: false + +env: + IMAGE: semianalysiswork/vllm-reconfigure:latest + +jobs: + reconfigure-ab-test: + runs-on: ${{ inputs.runner }} + timeout-minutes: 300 + name: "reconfigure-ab | ${{ inputs.model }} | ${{ inputs.runner }}" + steps: + - name: Resource cleanup (pre-run) + run: | + if command -v docker >/dev/null 2>&1 && docker info >/dev/null 2>&1; then + docker ps -aq | xargs -r docker rm -f || true + docker network prune -f || true + fi + + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + ref: ${{ github.ref }} + clean: false + + - name: Pull image + run: docker pull "$IMAGE" + + - name: Run A/B test + run: | + set -x + HF_HUB_CACHE_MOUNT="/raid/hf_hub_cache/" + + docker run --rm --init --network host \ + --runtime nvidia --gpus all --ipc host --privileged \ + --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 \ + -v "${HF_HUB_CACHE_MOUNT}:/root/.cache/huggingface" \ + -v "$GITHUB_WORKSPACE:/workspace" -w /workspace \ + -e HF_TOKEN \ + -e MODEL="${{ inputs.model }}" \ + -e TP="${{ inputs.tp }}" \ + -e CONC="${{ inputs.conc }}" \ + -e ISL="${{ inputs.isl }}" \ + 
-e OSL="${{ inputs.osl }}" \ + -e PORT=8888 \ + -e RANDOM_RANGE_RATIO=0.8 \ + -e VLLM_EXTRA_ARGS="${{ inputs.extra-args }}" \ + -e SKIP_BASELINE="${{ inputs.skip-baseline && '1' || '0' }}" \ + -e SKIP_RECONFIG="${{ inputs.skip-reconfig && '1' || '0' }}" \ + -e RESULT_FILENAME="reconfigure_ab_test" \ + -e NCCL_GRAPH_REGISTER=0 \ + -e PYTHONPYCACHEPREFIX=/tmp/pycache/ \ + "$IMAGE" \ + bash benchmarks/test_reconfigure_sweep.sh + + - name: Upload results + if: always() + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 + with: + name: reconfigure-ab-results-${{ inputs.runner }} + path: | + results_reconfigure_test/ + retention-days: 30 + + - name: Resource cleanup (post-run) + if: always() + run: | + if command -v docker >/dev/null 2>&1 && docker info >/dev/null 2>&1; then + docker ps -aq | xargs -r docker rm -f || true + fi From fd4704636a4c0ba5763e32df6d4254895520e477 Mon Sep 17 00:00:00 2001 From: Jordan Nanos Date: Tue, 14 Apr 2026 17:58:09 -0700 Subject: [PATCH 8/8] Potential fix for pull request finding 'CodeQL / Workflow does not contain permissions' Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com> --- .github/workflows/test-reconfigure.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/test-reconfigure.yml b/.github/workflows/test-reconfigure.yml index 2d63b0ff8..e701c8408 100644 --- a/.github/workflows/test-reconfigure.yml +++ b/.github/workflows/test-reconfigure.yml @@ -48,6 +48,9 @@ on: type: boolean default: false +permissions: + contents: read + env: IMAGE: semianalysiswork/vllm-reconfigure:latest