-
Notifications
You must be signed in to change notification settings - Fork 134
Add vLLM dynamic scheduler reconfigure for single-server sweeps #1029
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
dd1d58d
51f3290
a195297
d35f6c3
f9aa78c
7c5859d
c0859c1
fd47046
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,119 @@ | ||
# Manually-dispatched A/B benchmark on a single self-hosted GPU runner:
# Phase A runs baseline cold starts, Phase B exercises the vLLM dynamic
# scheduler reconfigure path. Results are uploaded as a build artifact.
name: Test - Scheduler Reconfigure A/B
run-name: "Reconfigure A/B | ${{ inputs.model }} tp=${{ inputs.tp }} conc=${{ inputs.conc }} | ${{ inputs.runner }}"

on:
  workflow_dispatch:
    inputs:
      runner:
        description: "Self-hosted runner label (e.g. b200-dgxc_1)"
        required: true
        type: string
      model:
        description: "HuggingFace model path"
        required: true
        type: string
        default: "openai/gpt-oss-120b"
      tp:
        description: "Tensor parallel size"
        required: true
        type: string
        default: "8"
      conc:
        description: "Benchmark concurrency"
        required: true
        type: string
        default: "32"
      isl:
        description: "Input sequence length"
        required: true
        type: string
        default: "1024"
      osl:
        description: "Output sequence length"
        required: true
        type: string
        default: "1024"
      extra-args:
        description: "Extra vllm serve args (e.g. --kv-cache-dtype fp8)"
        required: false
        type: string
      skip-baseline:
        description: "Skip Phase A (baseline cold starts)"
        required: false
        type: boolean
        default: false
      skip-reconfig:
        description: "Skip Phase B (reconfigure)"
        required: false
        type: boolean
        default: false

permissions:
  contents: read

env:
  IMAGE: semianalysiswork/vllm-reconfigure:latest

jobs:
  reconfigure-ab-test:
    runs-on: ${{ inputs.runner }}
    timeout-minutes: 300
    name: "reconfigure-ab | ${{ inputs.model }} | ${{ inputs.runner }}"
    steps:
      # Self-hosted runners keep state between jobs; remove any leftover
      # containers/networks before starting.
      - name: Resource cleanup (pre-run)
        run: |
          if command -v docker >/dev/null 2>&1 && docker info >/dev/null 2>&1; then
            docker ps -aq | xargs -r docker rm -f || true
            docker network prune -f || true
          fi

      # Actions are pinned to full commit SHAs (comment notes the tag).
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          ref: ${{ github.ref }}
          clean: false

      - name: Pull image
        run: docker pull "$IMAGE"

      - name: Run A/B test
        run: |
          set -x
          HF_HUB_CACHE_MOUNT="/raid/hf_hub_cache/"

          docker run --rm --init --network host \
            --runtime nvidia --gpus all --ipc host --privileged \
            --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 \
            -v "${HF_HUB_CACHE_MOUNT}:/root/.cache/huggingface" \
            -v "$GITHUB_WORKSPACE:/workspace" -w /workspace \
            -e HF_TOKEN \
            -e MODEL="${{ inputs.model }}" \
            -e TP="${{ inputs.tp }}" \
            -e CONC="${{ inputs.conc }}" \
            -e ISL="${{ inputs.isl }}" \
            -e OSL="${{ inputs.osl }}" \
            -e PORT=8888 \
            -e RANDOM_RANGE_RATIO=0.8 \
            -e VLLM_EXTRA_ARGS="${{ inputs.extra-args }}" \
            -e SKIP_BASELINE="${{ inputs.skip-baseline && '1' || '0' }}" \
            -e SKIP_RECONFIG="${{ inputs.skip-reconfig && '1' || '0' }}" \
            -e RESULT_FILENAME="reconfigure_ab_test" \
            -e NCCL_GRAPH_REGISTER=0 \
            -e PYTHONPYCACHEPREFIX=/tmp/pycache/ \
            "$IMAGE" \
            bash benchmarks/test_reconfigure_sweep.sh

      - name: Upload results
        if: always()
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
        with:
          name: reconfigure-ab-results-${{ inputs.runner }}
          path: |
            results_reconfigure_test/
          retention-days: 30

      - name: Resource cleanup (post-run)
        if: always()
        run: |
          if command -v docker >/dev/null 2>&1 && docker info >/dev/null 2>&1; then
            docker ps -aq | xargs -r docker rm -f || true
          fi
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -9,6 +9,57 @@ export PYTHONDONTWRITEBYTECODE=1 | |
| export PYTHONPYCACHEPREFIX="${PYTHONPYCACHEPREFIX:-/tmp/inferencex-pycache}" | ||
| mkdir -p "$PYTHONPYCACHEPREFIX" 2>/dev/null || true | ||
|
|
||
| # -------------------------------- | ||
| # vLLM dynamic scheduler reconfiguration | ||
| # -------------------------------- | ||
|
|
||
| # Reconfigure vLLM scheduler limits on a running endpoint. This requires a vLLM | ||
| # build that exposes POST /pause, POST /reconfigure, and POST /resume. | ||
| # The feature is opt-in via VLLM_DYNAMIC_RECONFIGURE=1 and is intended for | ||
| # single-server sweeps where model, parallelism, and cache layout stay fixed. | ||
| # | ||
| # Supported env vars (set before calling): | ||
| # VLLM_MAX_NUM_BATCHED_TOKENS -- max tokens scheduled per step | ||
| # VLLM_MAX_NUM_SEQS -- max concurrent sequences | ||
# Reconfigure scheduler limits on a running vLLM endpoint via its
# pause / reconfigure / resume HTTP API.
#
# Arguments:
#   $1 - server port (used unless VLLM_DYNAMIC_RECONFIGURE_BASE_URL is set)
# Reads:
#   VLLM_MAX_NUM_BATCHED_TOKENS, VLLM_MAX_NUM_SEQS (at least one required)
# Returns:
#   0 on success; 1 when no parameters are set or /pause fails;
#   the curl exit code when /reconfigure fails.
reconfigure_vllm_scheduler() {
  local port="$1"
  local base_url="${VLLM_DYNAMIC_RECONFIGURE_BASE_URL:-http://0.0.0.0:$port}"

  # Build the JSON body from whichever scheduler knobs are set.
  local json="{"
  local sep=""
  if [[ -n "${VLLM_MAX_NUM_BATCHED_TOKENS:-}" ]]; then
    json+="${sep}\"max_num_batched_tokens\":${VLLM_MAX_NUM_BATCHED_TOKENS}"
    sep=","
  fi
  if [[ -n "${VLLM_MAX_NUM_SEQS:-}" ]]; then
    json+="${sep}\"max_num_seqs\":${VLLM_MAX_NUM_SEQS}"
    sep=","
  fi
  json+="}"

  if [[ "$json" == "{}" ]]; then
    echo "VLLM_DYNAMIC_RECONFIGURE=1 but no VLLM scheduler parameters were set" >&2
    return 1
  fi

  echo "Reconfiguring vLLM scheduler at $base_url: $json"

  # Pause must succeed before touching the scheduler; otherwise bail out
  # instead of reconfiguring a server that is still serving traffic.
  if ! curl -fsS -X POST "$base_url/pause?mode=keep&clear_cache=true"; then
    echo "ERROR: /pause failed; skipping reconfigure" >&2
    return 1
  fi

  local rc=0
  curl -fsS -X POST "$base_url/reconfigure" \
    -H "Content-Type: application/json" \
    -d "$json" || rc=$?

  # Always attempt resume so the server is never left paused on failure.
  # A resume failure is only warned about so it cannot mask the
  # /reconfigure status (and cannot abort the script under `set -e`).
  curl -fsS -X POST "$base_url/resume" \
    || echo "WARNING: /resume failed; server may still be paused" >&2

  if [[ "$rc" -ne 0 ]]; then
    echo "ERROR: /reconfigure failed (curl exit code $rc)" >&2
    return "$rc"
  fi
}
|
|
||
| # -------------------------------- | ||
| # GPU monitoring helpers | ||
| # -------------------------------- | ||
|
|
@@ -326,6 +377,10 @@ run_benchmark_serving() { | |
| num_prompts="$max_concurrency" | ||
| fi | ||
|
|
||
| if [[ "${VLLM_DYNAMIC_RECONFIGURE:-0}" == "1" && "$backend" == "vllm" ]]; then | ||
| reconfigure_vllm_scheduler "$port" | ||
| fi | ||
|
|
||
| # Build benchmark command | ||
| local benchmark_cmd=( | ||
| python3 "$workspace_dir/utils/bench_serving/benchmark_serving.py" | ||
|
|
@@ -805,3 +860,81 @@ run_eval() { | |
| fi | ||
| return $eval_rc | ||
| } | ||
|
|
||
| # -------------------------------- | ||
| # Patched vLLM distribution helpers | ||
| # -------------------------------- | ||
|
|
||
| # Install a patched vLLM build before launching `vllm serve`. This is optional | ||
| # and only needed when the active container/image does not already include the | ||
| # dynamic scheduler reconfiguration API. | ||
| # | ||
| # Supported env vars: | ||
| # VLLM_PATCHED_WHEEL -- local or remote wheel path/URL | ||
| # VLLM_PATCHED_REPO -- git repository URL for patched vLLM | ||
| # VLLM_PATCHED_REF -- branch, tag, or commit for VLLM_PATCHED_REPO | ||
| # VLLM_PATCHED_CHECKOUT -- existing mounted checkout to install editable | ||
| # VLLM_PATCHED_INSTALL_MODE -- wheel, git, editable, or auto (default) | ||
# Install a patched vLLM build (wheel, git ref, or mounted checkout) before
# launching `vllm serve`. No-op success when no source is configured.
#
# Reads:
#   VLLM_PATCHED_INSTALL_MODE (wheel|git|editable|auto, default auto),
#   VLLM_PATCHED_WHEEL, VLLM_PATCHED_REPO, VLLM_PATCHED_REF,
#   VLLM_PATCHED_CHECKOUT, VLLM_PATCHED_GIT_DIR, VLLM_USE_PRECOMPILED
# Returns:
#   0 on success (or when nothing is configured), non-zero on any failure.
install_patched_vllm() {
  local mode="${VLLM_PATCHED_INSTALL_MODE:-auto}"

  # auto: infer the mode from whichever source variable is set.
  if [[ "$mode" == "auto" ]]; then
    if [[ -n "${VLLM_PATCHED_WHEEL:-}" ]]; then
      mode="wheel"
    elif [[ -n "${VLLM_PATCHED_CHECKOUT:-}" ]]; then
      mode="editable"
    elif [[ -n "${VLLM_PATCHED_REPO:-}" ]]; then
      mode="git"
    else
      echo "No patched vLLM install source configured; using existing vLLM"
      return 0
    fi
  fi

  case "$mode" in
    wheel)
      if [[ -z "${VLLM_PATCHED_WHEEL:-}" ]]; then
        echo "VLLM_PATCHED_INSTALL_MODE=wheel requires VLLM_PATCHED_WHEEL" >&2
        return 1
      fi
      echo "Installing patched vLLM wheel: $VLLM_PATCHED_WHEEL"
      python3 -m pip install --no-cache-dir --no-deps --force-reinstall "$VLLM_PATCHED_WHEEL" || return 1
      ;;
    git)
      if [[ -z "${VLLM_PATCHED_REPO:-}" || -z "${VLLM_PATCHED_REF:-}" ]]; then
        echo "VLLM_PATCHED_INSTALL_MODE=git requires VLLM_PATCHED_REPO and VLLM_PATCHED_REF" >&2
        return 1
      fi
      local checkout_dir="${VLLM_PATCHED_GIT_DIR:-/tmp/patched-vllm}"
      rm -rf -- "$checkout_dir"
      # Shallow clone works for branches/tags; fall back to a full clone +
      # checkout for bare commit SHAs. The fallback is chained with && so a
      # failed clone is never followed by checkout/install on a broken dir.
      git clone --depth 1 --branch "$VLLM_PATCHED_REF" "$VLLM_PATCHED_REPO" "$checkout_dir" || {
        git clone "$VLLM_PATCHED_REPO" "$checkout_dir" \
          && git -C "$checkout_dir" checkout "$VLLM_PATCHED_REF"
      } || return 1
      echo "Installing patched vLLM from $VLLM_PATCHED_REPO@$VLLM_PATCHED_REF"
      VLLM_USE_PRECOMPILED=${VLLM_USE_PRECOMPILED:-1} \
        python3 -m pip install --no-cache-dir -e "$checkout_dir" || return 1
      ;;
    editable)
      if [[ -z "${VLLM_PATCHED_CHECKOUT:-}" ]]; then
        echo "VLLM_PATCHED_INSTALL_MODE=editable requires VLLM_PATCHED_CHECKOUT" >&2
        return 1
      fi
      echo "Installing patched vLLM editable checkout: $VLLM_PATCHED_CHECKOUT"
      VLLM_USE_PRECOMPILED=${VLLM_USE_PRECOMPILED:-1} \
        python3 -m pip install --no-cache-dir -e "$VLLM_PATCHED_CHECKOUT" || return 1
      ;;
    *)
      echo "Unknown VLLM_PATCHED_INSTALL_MODE: $mode" >&2
      return 1
      ;;
  esac

  # Report the resulting vLLM version for the job log (best effort).
  python3 - <<'PY'
import importlib.metadata
try:
    print("Installed vLLM version:", importlib.metadata.version("vllm"))
except importlib.metadata.PackageNotFoundError:
    print("Installed vLLM version: unknown")
PY
}
Uh oh!
There was an error while loading. Please reload this page.