From 1f3d8dd2f5620d12b2749058ed1451a17554dfdb Mon Sep 17 00:00:00 2001 From: "Albert Cheng (Engrg-Hardware 1)" Date: Wed, 10 Jun 2026 18:07:34 -0700 Subject: [PATCH 1/2] =?UTF-8?q?[NV]=20Add=20SpeedBench=20AL=20collectors?= =?UTF-8?q?=20for=20DSR1=20/=20GLM-5=20/=20Qwen3.5=20(B300=20vLLM=20MTP)?= =?UTF-8?q?=20Mirror=20the=20merged=20dsv4=20collector=20(#1650)=20for=20t?= =?UTF-8?q?hree=20more=20models,=20reusing=20the=20same=20speedbench-al.ym?= =?UTF-8?q?l=20workflow=20(model/model-prefix=20are=20inputs;=20no=20workf?= =?UTF-8?q?low=20or=20launcher=20change=20=E2=80=94=20all=20three=20are=20?= =?UTF-8?q?already=20in=20launch=5Fb300-nv.sh=20STAGED=5FMODELS).=20Per-mo?= =?UTF-8?q?del=20serve=20args=20match=20the=20locally-validated=20scripts;?= =?UTF-8?q?=20glm5+qwen3.5=20apply=20the=20#1695=20CHAT=5FTEMPLATE=5FKWARG?= =?UTF-8?q?S=20quoting=20fix;=20dsr1=20is=20thinking-on=20only.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../speedbench/dsr1_fp4_b300_vllm.sh | 242 ++++++++++++ .../speedbench/glm5_fp4_b300_vllm.sh | 359 +++++++++++++++++ .../speedbench/qwen3.5_fp4_b300_vllm.sh | 365 ++++++++++++++++++ 3 files changed, 966 insertions(+) create mode 100755 benchmarks/single_node/speedbench/dsr1_fp4_b300_vllm.sh create mode 100755 benchmarks/single_node/speedbench/glm5_fp4_b300_vllm.sh create mode 100755 benchmarks/single_node/speedbench/qwen3.5_fp4_b300_vllm.sh diff --git a/benchmarks/single_node/speedbench/dsr1_fp4_b300_vllm.sh b/benchmarks/single_node/speedbench/dsr1_fp4_b300_vllm.sh new file mode 100755 index 000000000..aee0bb967 --- /dev/null +++ b/benchmarks/single_node/speedbench/dsr1_fp4_b300_vllm.sh @@ -0,0 +1,242 @@ +#!/usr/bin/env bash + +# DeepSeek-R1 B300 vLLM SPEED-Bench AL matrix collector. +# +# Produces the golden acceptance-length (AL) reference matrix consumed by the +# synthetic-acceptance framework: for each MTP level (num_speculative_tokens), +# measure the REAL AL on a single SPEED-Bench category (default: coding) and emit +# a YAML matrix identical in shape to benchmarks/speedbench-reference-al.yaml. +# This measures real MTP acceptance; the synthetic value is injected downstream +# by the throughput recipe, not here. +# +# Adapted from speedbench/dsv4_fp4_b300_vllm.sh. DeepSeek-R1 is DeepSeek-V3 +# architecture (MLA dense attention), NOT V4 (DSA / Lightning Indexer), so vs the +# DSV4 collector: +# - NO --tokenizer-mode deepseek_v4 / --reasoning-parser deepseek_v4 / +# --tool-call-parser deepseek_v4 (all V4-specific; the official vLLM R1 +# serve command is bare). reasoning-parser is irrelevant here anyway: AL is +# read from /metrics, not from parsed output. +# - NO --attention_config.use_fp4_indexer_cache (that knob is dsv32/MLA-indexer +# only; R1 is plain MLA and never reads it). +# - NO --block-size / --compilation-config (the official R1 recipe omits them; +# defaults apply). --kv-cache-dtype fp8 IS kept, to match the dsv4/qwen/glm +# collectors so all golden AL values share one kv-cache numeric regime. +# - FP4 on Blackwell needs FlashInfer MoE: export VLLM_USE_FLASHINFER_MOE_FP4=1. +# - THINKING: R1 is a pure reasoning model and always emits (its chat +# template has no enable_thinking toggle). There is no thinking-off mode, so +# this collector measures thinking_on only and needs no --chat-template-kwargs +# shim (the default client-side template render already enables thinking). +# +# Checkpoint (B300 / Blackwell): NVFP4 build nvidia/DeepSeek-R1-0528-NVFP4-v2, +# basename dsr1-fp4 on the runner (resolved by launch_b300-nv.sh). +# +# Usage (inside the vLLM container, on a B300 node): +# export MODEL=/data/models/dsr1-fp4 +# bash benchmarks/single_node/speedbench/dsr1_fp4_b300_vllm.sh +# +# Tunables (env): +# MTP_LIST space-separated MTP levels (default "1 2 3 4 5 6 7 8") +# THINKING_MODES space-separated: on (default "on"; R1 has no off) +# CATEGORY SPEED-Bench category (default coding) +# SPEEDBENCH_OUTPUT_LEN per-request output len (default 4096) +# OUT_YAML output matrix path (default $RESULTS_DIR/speedbench-reference-al.yaml) + +set -uo pipefail +source "$(dirname "$0")/../../benchmark_lib.sh" + +MODEL="${MODEL:?MODEL env var required (e.g. /data/models/dsr1-fp4)}" +SERVE_MODEL="${MODEL_PATH:-$MODEL}" +TP="${TP:-8}" +DP_ATTENTION="${DP_ATTENTION:-false}" +PORT="${PORT:-8888}" + +MTP_LIST="${MTP_LIST:-1 2 3 4 5 6 7 8}" +THINKING_MODES="${THINKING_MODES:-on}" +CATEGORY="${CATEGORY:-coding}" +MODEL_KEY="${MODEL_KEY:-$(basename "$SERVE_MODEL" | tr '[:upper:]' '[:lower:]')}" +SPEEDBENCH_OUTPUT_LEN="${SPEEDBENCH_OUTPUT_LEN:-4096}" +CONCURRENCY="${CONCURRENCY:-1}" +# Provider-recommended sampling from the DeepSeek-R1 checkpoint generation_config +# (temperature 0.6, top_p 0.95; no top_k). vLLM's own default top_p is 1.0, so it +# MUST be passed explicitly or the measured AL is taken at the wrong settings. +TEMPERATURE="${TEMPERATURE:-0.6}" +TOP_P="${TOP_P:-0.95}" + +SPEEDBENCH_DIR="${SPEEDBENCH_DIR:-/workspace/speed_bench_data}" +# Flat results dir to match the speedbench-al.yml artifact glob +# (speedbench_results/server_*.log) and its pre-run `rm -rf speedbench_results`. +RESULTS_DIR="${RESULTS_DIR:-/workspace/speedbench_results}" +OUT_YAML="${OUT_YAML:-$RESULTS_DIR/speedbench-reference-al.yaml}" + +# Blackwell FP4 MoE path (DeepSeek-R1 FP4 on B-series): required per vLLM R1 docs. +export VLLM_USE_FLASHINFER_MOE_FP4="${VLLM_USE_FLASHINFER_MOE_FP4:-1}" +export VLLM_ENGINE_READY_TIMEOUT_S=3600 + +mkdir -p "$RESULTS_DIR" +nvidia-smi +if [[ "$SERVE_MODEL" != /* ]]; then hf download "$SERVE_MODEL"; fi + +# ---- Download SPEED-Bench dataset ---- +echo "=== Downloading SPEED-Bench dataset ===" +pip install -q datasets tiktoken +curl -LsSf https://raw.githubusercontent.com/NVIDIA-NeMo/Skills/refs/heads/main/nemo_skills/dataset/speed-bench/prepare.py \ + | python3 - --config qualitative --output_dir "$SPEEDBENCH_DIR" + +if [[ ! -f "$SPEEDBENCH_DIR/qualitative.jsonl" ]]; then + echo "CRITICAL: SPEED-Bench download failed — $SPEEDBENCH_DIR/qualitative.jsonl not found" + exit 1 +fi + +PARALLEL_ARGS=(--tensor-parallel-size "$TP" --data-parallel-size 1) +if [ "${DP_ATTENTION}" = "true" ]; then + PARALLEL_ARGS=(--tensor-parallel-size 1 --data-parallel-size "$TP") +fi + +fetch_metric() { + local port="$1" name="$2" + curl -s "http://localhost:${port}/metrics" \ + | grep -oP "${name}\\{[^}]*\\} \\K[0-9.]+" || echo "0" +} + +SERVER_PID="" +_descendants() { + local pid="$1" child + for child in $(pgrep -P "$pid" 2>/dev/null || true); do + echo "$child" + _descendants "$child" + done +} +cleanup_server() { + if [[ -n "$SERVER_PID" ]]; then + local descendants + descendants=$(_descendants "$SERVER_PID") + kill "$SERVER_PID" 2>/dev/null || true + wait "$SERVER_PID" 2>/dev/null || true + local pid + for pid in $descendants; do + kill -9 "$pid" 2>/dev/null || true + done + local waited=0 + while [[ $waited -lt 120 ]]; do + local used + used=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits 2>/dev/null | sort -rn | head -1) + if [[ -z "$used" || "$used" -lt 2000 ]]; then break; fi + sleep 3; waited=$((waited + 3)) + done + SERVER_PID="" + fi +} +trap 'cleanup_server' EXIT + +start_gpu_monitor + +declare -A AL_RESULT + +run_cell() { + local mode="$1" mtp="$2" + + echo "" + echo "==========================================" + echo " Cell: thinking=$mode MTP=$mtp category=$CATEGORY" + echo "==========================================" + + local serve_args=( + --host 0.0.0.0 --port "$PORT" + "${PARALLEL_ARGS[@]}" + --pipeline-parallel-size 1 + --trust-remote-code + --enable-expert-parallel + --kv-cache-dtype fp8 + --no-enable-prefix-caching + --max-model-len 16384 + --speculative-config "{\"method\": \"mtp\", \"num_speculative_tokens\": $mtp}" + ) + + local server_log="$RESULTS_DIR/server_${mode}_mtp${mtp}.log" + vllm serve "$SERVE_MODEL" "${serve_args[@]}" > "$server_log" 2>&1 & + SERVER_PID=$! + + if ! wait_for_server_ready --port "$PORT" --server-log "$server_log" --server-pid "$SERVER_PID"; then + echo " -> server failed to start (thinking=$mode mtp=$mtp), recording N/A" + AL_RESULT["${mode}_${mtp}"]="N/A" + cleanup_server + return + fi + + local acc_before drf_before acc_after drf_after + acc_before=$(fetch_metric "$PORT" "vllm:spec_decode_num_accepted_tokens_total") + drf_before=$(fetch_metric "$PORT" "vllm:spec_decode_num_drafts_total") + + vllm bench serve \ + --model "$SERVE_MODEL" \ + --port "$PORT" \ + --dataset-name speed_bench \ + --dataset-path "$SPEEDBENCH_DIR" \ + --speed-bench-category "$CATEGORY" \ + --speed-bench-output-len "$SPEEDBENCH_OUTPUT_LEN" \ + --num-prompts -1 \ + --max-concurrency "$CONCURRENCY" \ + --save-result \ + --result-dir "$RESULTS_DIR" \ + --result-filename "speedbench_${mode}_mtp${mtp}" \ + --trust-remote-code \ + --temperature "$TEMPERATURE" \ + --top-p "$TOP_P" + + acc_after=$(fetch_metric "$PORT" "vllm:spec_decode_num_accepted_tokens_total") + drf_after=$(fetch_metric "$PORT" "vllm:spec_decode_num_drafts_total") + + local delta_acc delta_drf al + delta_acc=$(awk "BEGIN {printf \"%d\", $acc_after - $acc_before}") + delta_drf=$(awk "BEGIN {printf \"%d\", $drf_after - $drf_before}") + if [[ "$delta_drf" -gt 0 ]]; then + al=$(awk "BEGIN {printf \"%.2f\", 1 + ($delta_acc / $delta_drf)}") + else + al="N/A" + fi + echo " -> thinking=$mode MTP=$mtp AL=$al (accepted=$delta_acc drafts=$delta_drf)" + AL_RESULT["${mode}_${mtp}"]="$al" + + cleanup_server +} + +for mode in $THINKING_MODES; do + for mtp in $MTP_LIST; do + run_cell "$mode" "$mtp" + done +done + +stop_gpu_monitor + +# ---- Emit the YAML matrix ---- +emit_mode_block() { + local mode="$1" + for mtp in $MTP_LIST; do + echo " $mtp: ${AL_RESULT[${mode}_${mtp}]:-N/A}" + done +} + +{ + echo "# Acceptance Length (AL) reference values measured with SPEED-Bench." + echo "# dataset: $CATEGORY | temperature: $TEMPERATURE | top_p: $TOP_P | output_len: $SPEEDBENCH_OUTPUT_LEN" + echo "# DeepSeek-R1 always reasons (no thinking-off mode), so only thinking_on is emitted." + echo "# Measured on $MODEL_KEY (B300, vLLM MTP), per num_speculative_tokens." + echo "# Auto-generated by benchmarks/single_node/speedbench/dsr1_fp4_b300_vllm.sh (speedbench-al.yml)." + echo "#" + echo "# key = num_speculative_tokens (MTP level); value = golden AL" + echo "${MODEL_KEY}:" + if [[ " $THINKING_MODES " == *" on "* ]]; then + echo " thinking_on:" + emit_mode_block on + fi + if [[ " $THINKING_MODES " == *" off "* ]]; then + echo " thinking_off:" + emit_mode_block off + fi +} > "$OUT_YAML" + +echo "" +echo "==========================================" +echo " SPEED-Bench AL matrix written to: $OUT_YAML" +echo "==========================================" +cat "$OUT_YAML" diff --git a/benchmarks/single_node/speedbench/glm5_fp4_b300_vllm.sh b/benchmarks/single_node/speedbench/glm5_fp4_b300_vllm.sh new file mode 100755 index 000000000..4fb0d114b --- /dev/null +++ b/benchmarks/single_node/speedbench/glm5_fp4_b300_vllm.sh @@ -0,0 +1,359 @@ +#!/usr/bin/env bash + +# GLM-5 B300 vLLM SPEED-Bench AL matrix collector. +# +# Produces the golden acceptance-length (AL) reference matrix consumed by the +# synthetic-acceptance framework: for each thinking mode (on/off) and each MTP +# level (num_speculative_tokens), measure the REAL AL on a single SPEED-Bench +# category (default: coding) and emit a YAML matrix identical in shape to +# benchmarks/speedbench-reference-al.yaml. This measures real MTP acceptance; +# the synthetic value is injected downstream by the throughput recipe, not here. +# +# Filename *_fp4_* matches both the speedbench-al.yml path convention +# (benchmarks/single_node/speedbench/${model-prefix}_fp4_b300_vllm.sh) and the +# served checkpoint: we serve the NVFP4 build (GLM-5-NVFP4), like every model in +# this matrix. The official vLLM GLM recipe only documents FP8, but the B300 runs +# use the NVFP4 checkpoint. +# +# Adapted from speedbench/dsv4_fp4_b300_vllm.sh. Differences vs DSV4 (deepseek_v4 +# is NOT reusable for GLM): +# - reasoning-parser glm45 (was deepseek_v4) +# - tool-call-parser glm47 (was deepseek_v4) +# - --chat-template-content-format=string (GLM requirement per vLLM docs) +# - NO --tokenizer-mode deepseek_v4 (GLM uses the default/auto tokenizer) +# - --attention_config.use_fp4_indexer_cache is NOT passed (and must not be). +# Despite GLM-5 also being DSA sparse attention, that knob is wired ONLY for +# the DeepSeek dsv32 family: it is read solely by vllm/models/deepseek_v4/ +# attention.py and the MLA indexer backend (vllm/v1/attention/backends/mla/ +# indexer.py). GLM's DSA (GlmMoeDsaForCausalLM) is a separate codepath that +# never reads it, so setting it would be a no-op at best or a config error at +# worst. A GLM DSA-indexer OOM would need a GLM-specific option, not this one. +# - thinking on/off uses the enable_thinking chat_template key; thinking is ON +# by default for GLM, so the OFF cell MUST pass enable_thinking:false explicitly +# +# Checkpoint (B300 / Blackwell): NVFP4 build, basename GLM-5-NVFP4. NVIDIA's +# GLM-5-NVFP4 model card serves it with vllm/vllm-openai:latest, and the runner's +# vllm-openai:v0.21.0 (May) is newer than that 3/16 example, so it loads directly. +# For tool calling + MTP together, vLLM docs recommend a recent build. +# +# Usage (inside the GLM vLLM container, on a B300 node): +# export MODEL=/scratch/models/GLM-5-NVFP4 +# bash benchmarks/single_node/speedbench/glm5_fp4_b300_vllm.sh +# +# Tunables (env): +# MTP_LIST space-separated MTP levels (default "1 2 3 4 5 6 7 8") +# THINKING_MODES space-separated: off|on (default "off on") +# CATEGORY SPEED-Bench category (default coding) +# SPEEDBENCH_OUTPUT_LEN per-request output len (default 4096) +# OUT_YAML output matrix path (default $RESULTS_DIR/speedbench-reference-al.yaml) + +set -uo pipefail +source "$(dirname "$0")/../../benchmark_lib.sh" + +MODEL="${MODEL:?MODEL env var required (e.g. /scratch/models/GLM-5-NVFP4)}" +SERVE_MODEL="${MODEL_PATH:-$MODEL}" +TP="${TP:-8}" +DP_ATTENTION="${DP_ATTENTION:-false}" +EP_SIZE="${EP_SIZE:-1}" +PORT="${PORT:-8888}" +# NVIDIA's GLM-5-NVFP4 model card serves with 0.80; NVFP4 + DSA + MTP draft +# layers leave less headroom than DSV4, so match it to avoid startup OOM. +GPU_MEM_UTIL="${GPU_MEM_UTIL:-0.80}" + +MTP_LIST="${MTP_LIST:-1 2 3 4 5 6 7 8}" +THINKING_MODES="${THINKING_MODES:-off on}" +CATEGORY="${CATEGORY:-coding}" +MODEL_KEY="${MODEL_KEY:-$(basename "$SERVE_MODEL" | tr '[:upper:]' '[:lower:]')}" +SPEEDBENCH_OUTPUT_LEN="${SPEEDBENCH_OUTPUT_LEN:-4096}" +CONCURRENCY="${CONCURRENCY:-1}" +# Provider-recommended sampling from the GLM-5 checkpoint generation_config.json +# (temperature 1.0, top_p 0.95). vLLM's own default top_p is 1.0, so it MUST be +# passed explicitly or the measured AL is taken at the wrong sampling settings. +TEMPERATURE="${TEMPERATURE:-1.0}" +TOP_P="${TOP_P:-0.95}" +# GLM thinking toggles via the enable_thinking chat_template key (default ON). +# Use separate single-quoted defaults: an inline ${VAR:-{...}} default whose value +# contains "}" is truncated by bash brace parsing (matches upstream fix #1695). +DEFAULT_CHAT_TEMPLATE_KWARGS_ON='{"enable_thinking": true}' +DEFAULT_CHAT_TEMPLATE_KWARGS_OFF='{"enable_thinking": false}' +CHAT_TEMPLATE_KWARGS_ON="${CHAT_TEMPLATE_KWARGS_ON:-$DEFAULT_CHAT_TEMPLATE_KWARGS_ON}" +CHAT_TEMPLATE_KWARGS_OFF="${CHAT_TEMPLATE_KWARGS_OFF:-$DEFAULT_CHAT_TEMPLATE_KWARGS_OFF}" + +SPEEDBENCH_DIR="${SPEEDBENCH_DIR:-/workspace/speed_bench_data}" +# Flat results dir to match the speedbench-al.yml artifact glob +# (speedbench_results/server_*.log) and its pre-run `rm -rf speedbench_results`. +RESULTS_DIR="${RESULTS_DIR:-/workspace/speedbench_results}" +OUT_YAML="${OUT_YAML:-$RESULTS_DIR/speedbench-reference-al.yaml}" + +export VLLM_ENGINE_READY_TIMEOUT_S=3600 + +mkdir -p "$RESULTS_DIR" +nvidia-smi +if [[ "$SERVE_MODEL" != /* ]]; then hf download "$SERVE_MODEL"; fi + +# ---- Download SPEED-Bench dataset ---- +echo "=== Downloading SPEED-Bench dataset ===" +pip install -q datasets tiktoken +curl -LsSf https://raw.githubusercontent.com/NVIDIA-NeMo/Skills/refs/heads/main/nemo_skills/dataset/speed-bench/prepare.py \ + | python3 - --config qualitative --output_dir "$SPEEDBENCH_DIR" + +if [[ ! -f "$SPEEDBENCH_DIR/qualitative.jsonl" ]]; then + echo "CRITICAL: SPEED-Bench download failed — $SPEEDBENCH_DIR/qualitative.jsonl not found" + exit 1 +fi + +# ---- Temporary shim: add a real --chat-template-kwargs CLI option ---- +# Upstream gap (until vllm-project/vllm#44244 lands): speed_bench/CustomDataset +# pre-renders the chat template client-side WITHOUT chat_template_kwargs and +# posts to /v1/completions, so thinking mode cannot be enabled via --extra-body +# or --default-chat-template-kwargs. This wires a proper --chat-template-kwargs +# option through get_samples into CustomDataset.sample's apply_chat_template. +# Model agnostic (forwards whatever dict it is given). TODO: delete once #44244 +# is released in the benchmark image; idempotent (marker check), safe to leave. +apply_chat_template_kwargs_shim() { + echo "=== Patching vLLM benchmark to add --chat-template-kwargs (temporary shim) ===" + python3 - <<'PYEOF' +import vllm.benchmarks.serve as S +import vllm.benchmarks.datasets.datasets as D + +def patch(mod, edits, marker): + f = mod.__file__ + src = open(f).read() + if marker in src: + print("already patched:", f) + return + for old, new in edits: + n = src.count(old) + assert n == 1, f"anchor matched {n} times in {f}, aborting:\n{old[:80]}..." + src = src.replace(old, new, 1) + open(f, "w").write(src) + print("patched OK ->", f) + +# Edit 1: serve.py -- declare the --chat-template-kwargs argument before --extra-body +serve_old = ''' parser.add_argument( + "--extra-body",''' +serve_new = ''' parser.add_argument( + "--chat-template-kwargs", + type=json.loads, + default=None, + help="JSON dict forwarded to apply_chat_template during " + "client-side prompt rendering, e.g. to enable reasoning mode.", + ) + parser.add_argument( + "--extra-body",''' +patch(S, [(serve_old, serve_new)], marker='"--chat-template-kwargs"') + +# Edit 2: datasets.py -- forward args.chat_template_kwargs into the speed_bench .sample() call +disp_old = ''' output_len=args.speed_bench_output_len, + enable_multimodal_chat=args.enable_multimodal_chat,''' +disp_new = ''' output_len=args.speed_bench_output_len, + chat_template_kwargs=args.chat_template_kwargs, + enable_multimodal_chat=args.enable_multimodal_chat,''' + +# Edit 3: datasets.py -- forward chat_template_kwargs into CustomDataset.sample's template call +samp_old = ''' # apply template + if not skip_chat_template: + prompt = tokenizer.apply_chat_template( + [{"role": "user", "content": prompt}], + add_generation_prompt=True, + tokenize=False, + ) + + prompt_len = len(tokenizer(prompt).input_ids)''' +samp_new = ''' # apply template + if not skip_chat_template: + _ctk = kwargs.get("chat_template_kwargs") or {} + prompt = tokenizer.apply_chat_template( + [{"role": "user", "content": prompt}], + add_generation_prompt=True, + tokenize=False, + **_ctk, + ) + + prompt_len = len(tokenizer(prompt).input_ids)''' +patch(D, [(disp_old, disp_new), (samp_old, samp_new)], + marker="chat_template_kwargs=args.chat_template_kwargs") +PYEOF +} + +# Apply the shim once if any cell will pass chat_template_kwargs. +NEED_SHIM=0 +if [[ " $THINKING_MODES " == *" on "* && -n "$CHAT_TEMPLATE_KWARGS_ON" ]]; then NEED_SHIM=1; fi +if [[ " $THINKING_MODES " == *" off "* && -n "$CHAT_TEMPLATE_KWARGS_OFF" ]]; then NEED_SHIM=1; fi +if [[ "$NEED_SHIM" == "1" ]]; then + if ! apply_chat_template_kwargs_shim; then + echo "CRITICAL: --chat-template-kwargs shim failed — aborting" + exit 1 + fi +fi + +PARALLEL_ARGS=(--tensor-parallel-size "$TP" --data-parallel-size 1) +if [ "${DP_ATTENTION}" = "true" ]; then + PARALLEL_ARGS=(--tensor-parallel-size 1 --data-parallel-size "$TP") +fi +EP_ARGS=() +if [ "${EP_SIZE:-1}" -gt 1 ]; then + EP_ARGS=(--enable-expert-parallel) +fi + +fetch_metric() { + local port="$1" name="$2" + curl -s "http://localhost:${port}/metrics" \ + | grep -oP "${name}\\{[^}]*\\} \\K[0-9.]+" || echo "0" +} + +SERVER_PID="" +_descendants() { + local pid="$1" child + for child in $(pgrep -P "$pid" 2>/dev/null || true); do + echo "$child" + _descendants "$child" + done +} +cleanup_server() { + if [[ -n "$SERVER_PID" ]]; then + local descendants + descendants=$(_descendants "$SERVER_PID") + kill "$SERVER_PID" 2>/dev/null || true + wait "$SERVER_PID" 2>/dev/null || true + local pid + for pid in $descendants; do + kill -9 "$pid" 2>/dev/null || true + done + local waited=0 + while [[ $waited -lt 120 ]]; do + local used + used=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits 2>/dev/null | sort -rn | head -1) + if [[ -z "$used" || "$used" -lt 2000 ]]; then break; fi + sleep 3; waited=$((waited + 3)) + done + SERVER_PID="" + fi +} +trap 'cleanup_server' EXIT + +start_gpu_monitor + +declare -A AL_RESULT + +run_cell() { + local mode="$1" mtp="$2" + local think_args=() + if [[ "$mode" == "on" && -n "$CHAT_TEMPLATE_KWARGS_ON" ]]; then + think_args=(--chat-template-kwargs "$CHAT_TEMPLATE_KWARGS_ON") + elif [[ "$mode" == "off" && -n "$CHAT_TEMPLATE_KWARGS_OFF" ]]; then + think_args=(--chat-template-kwargs "$CHAT_TEMPLATE_KWARGS_OFF") + fi + + echo "" + echo "==========================================" + echo " Cell: thinking=$mode MTP=$mtp category=$CATEGORY" + echo "==========================================" + + local serve_args=( + --host 0.0.0.0 --port "$PORT" + "${PARALLEL_ARGS[@]}" + --pipeline-parallel-size 1 + --kv-cache-dtype fp8 + --trust-remote-code + --no-enable-prefix-caching + "${EP_ARGS[@]}" + --reasoning-parser glm45 + --tool-call-parser glm47 + --enable-auto-tool-choice + --chat-template-content-format=string + --gpu-memory-utilization "$GPU_MEM_UTIL" + --max-model-len 16384 + --speculative-config "{\"method\": \"mtp\", \"num_speculative_tokens\": $mtp}" + ) + + local server_log="$RESULTS_DIR/server_${mode}_mtp${mtp}.log" + vllm serve "$SERVE_MODEL" "${serve_args[@]}" > "$server_log" 2>&1 & + SERVER_PID=$! + + if ! wait_for_server_ready --port "$PORT" --server-log "$server_log" --server-pid "$SERVER_PID"; then + echo " -> server failed to start (thinking=$mode mtp=$mtp), recording N/A" + AL_RESULT["${mode}_${mtp}"]="N/A" + cleanup_server + return + fi + + local acc_before drf_before acc_after drf_after + acc_before=$(fetch_metric "$PORT" "vllm:spec_decode_num_accepted_tokens_total") + drf_before=$(fetch_metric "$PORT" "vllm:spec_decode_num_drafts_total") + + vllm bench serve \ + --model "$SERVE_MODEL" \ + --port "$PORT" \ + --dataset-name speed_bench \ + --dataset-path "$SPEEDBENCH_DIR" \ + --speed-bench-category "$CATEGORY" \ + --speed-bench-output-len "$SPEEDBENCH_OUTPUT_LEN" \ + --num-prompts -1 \ + --max-concurrency "$CONCURRENCY" \ + --save-result \ + --result-dir "$RESULTS_DIR" \ + --result-filename "speedbench_${mode}_mtp${mtp}" \ + --trust-remote-code \ + --temperature "$TEMPERATURE" \ + --top-p "$TOP_P" \ + "${think_args[@]}" + + acc_after=$(fetch_metric "$PORT" "vllm:spec_decode_num_accepted_tokens_total") + drf_after=$(fetch_metric "$PORT" "vllm:spec_decode_num_drafts_total") + + local delta_acc delta_drf al + delta_acc=$(awk "BEGIN {printf \"%d\", $acc_after - $acc_before}") + delta_drf=$(awk "BEGIN {printf \"%d\", $drf_after - $drf_before}") + if [[ "$delta_drf" -gt 0 ]]; then + al=$(awk "BEGIN {printf \"%.2f\", 1 + ($delta_acc / $delta_drf)}") + else + al="N/A" + fi + echo " -> thinking=$mode MTP=$mtp AL=$al (accepted=$delta_acc drafts=$delta_drf)" + AL_RESULT["${mode}_${mtp}"]="$al" + + cleanup_server +} + +for mode in $THINKING_MODES; do + for mtp in $MTP_LIST; do + run_cell "$mode" "$mtp" + done +done + +stop_gpu_monitor + +# ---- Emit the YAML matrix ---- +emit_mode_block() { + local mode="$1" + for mtp in $MTP_LIST; do + echo " $mtp: ${AL_RESULT[${mode}_${mtp}]:-N/A}" + done +} + +{ + echo "# Acceptance Length (AL) reference values measured with SPEED-Bench." + echo "# dataset: $CATEGORY | temperature: $TEMPERATURE | top_p: $TOP_P | output_len: $SPEEDBENCH_OUTPUT_LEN" + echo "# thinking_on chat_template_kwargs: $CHAT_TEMPLATE_KWARGS_ON" + echo "# thinking_off chat_template_kwargs: $CHAT_TEMPLATE_KWARGS_OFF" + echo "# Measured on $MODEL_KEY (B300, vLLM MTP), per num_speculative_tokens." + echo "# Auto-generated by benchmarks/single_node/speedbench/glm5_fp4_b300_vllm.sh (speedbench-al.yml)." + echo "#" + echo "# key = num_speculative_tokens (MTP level); value = golden AL" + echo "${MODEL_KEY}:" + if [[ " $THINKING_MODES " == *" on "* ]]; then + echo " thinking_on:" + emit_mode_block on + fi + if [[ " $THINKING_MODES " == *" off "* ]]; then + echo " thinking_off:" + emit_mode_block off + fi +} > "$OUT_YAML" + +echo "" +echo "==========================================" +echo " SPEED-Bench AL matrix written to: $OUT_YAML" +echo "==========================================" +cat "$OUT_YAML" diff --git a/benchmarks/single_node/speedbench/qwen3.5_fp4_b300_vllm.sh b/benchmarks/single_node/speedbench/qwen3.5_fp4_b300_vllm.sh new file mode 100755 index 000000000..4935337a1 --- /dev/null +++ b/benchmarks/single_node/speedbench/qwen3.5_fp4_b300_vllm.sh @@ -0,0 +1,365 @@ +#!/usr/bin/env bash + +# Qwen3.5-397B-A17B B300 vLLM SPEED-Bench AL matrix collector. +# +# Produces the golden acceptance-length (AL) reference matrix consumed by the +# synthetic-acceptance framework: for each thinking mode (on/off) and each MTP +# level (num_speculative_tokens), measure the REAL AL on a single SPEED-Bench +# category (default: coding) and emit a YAML matrix identical in shape to +# benchmarks/speedbench-reference-al.yaml. This measures real MTP acceptance; +# the synthetic value is injected downstream by the throughput recipe, not here. +# +# Adapted from speedbench/dsv4_fp4_b300_vllm.sh. Differences vs DSV4 (deepseek_v4 +# is NOT reusable for Qwen): +# - reasoning-parser qwen3 (was deepseek_v4) +# - tool-call-parser qwen3_coder (was deepseek_v4) +# - NO --tokenizer-mode deepseek_v4 (Qwen uses the default/auto tokenizer) +# - NO --attention_config.use_fp4_indexer_cache (DSV4 sparse-attn only) +# - --max-cudagraph-capture-size 512 (Qwen3.5 is a mamba hybrid; a large +# capture size trips the causal_conv1d +# assert, see vLLM docs / PR #34571) +# - thinking on/off uses the enable_thinking chat_template key, and OFF is +# passed explicitly (Qwen does not treat "no kwargs" as off the way deepseek does) +# +# Checkpoint (B300 / Blackwell): NVFP4 build, basename Qwen3.5-397B-A17B-NVFP4. +# NVIDIA's Qwen3.5-397B-A17B-NVFP4 model card serves it with vllm/vllm-openai:latest; +# the runner's vllm-openai:v0.21.0 (May) is newer and loads it. +# +# Usage (inside the vLLM container, on a B300 node): +# export MODEL=/scratch/models/Qwen3.5-397B-A17B-NVFP4 +# bash benchmarks/single_node/speedbench/qwen3.5_fp4_b300_vllm.sh +# +# Tunables (env): +# MTP_LIST space-separated MTP levels (default "1 2 3 4 5 6 7 8") +# THINKING_MODES space-separated: off|on (default "off on") +# CATEGORY SPEED-Bench category (default coding) +# SPEEDBENCH_OUTPUT_LEN per-request output len (default 4096) +# OUT_YAML output matrix path (default $RESULTS_DIR/speedbench-reference-al.yaml) + +set -uo pipefail +source "$(dirname "$0")/../../benchmark_lib.sh" + +MODEL="${MODEL:?MODEL env var required (e.g. /scratch/models/Qwen3.5-397B-A17B-NVFP4)}" +SERVE_MODEL="${MODEL_PATH:-$MODEL}" +TP="${TP:-8}" +DP_ATTENTION="${DP_ATTENTION:-false}" +EP_SIZE="${EP_SIZE:-1}" +PORT="${PORT:-8888}" + +MTP_LIST="${MTP_LIST:-1 2 3 4 5 6 7 8}" +THINKING_MODES="${THINKING_MODES:-off on}" +CATEGORY="${CATEGORY:-coding}" +MODEL_KEY="${MODEL_KEY:-$(basename "$SERVE_MODEL" | tr '[:upper:]' '[:lower:]')}" +SPEEDBENCH_OUTPUT_LEN="${SPEEDBENCH_OUTPUT_LEN:-4096}" +CONCURRENCY="${CONCURRENCY:-1}" +# Provider-recommended sampling — DIFFERS by mode (per the Qwen3.5 model card): +# thinking : temperature 0.6, top_p 0.95, top_k 20, presence_penalty 0.0 +# instruct : temperature 0.7, top_p 0.8, top_k 20, presence_penalty 1.5 +# (min_p 0.0 / repetition_penalty 1.0 are vLLM defaults.) These MUST be passed +# per-mode or the measured AL is taken at the wrong sampling settings. +TEMPERATURE_ON="${TEMPERATURE_ON:-0.6}"; TOP_P_ON="${TOP_P_ON:-0.95}"; TOP_K_ON="${TOP_K_ON:-20}"; PRESENCE_PENALTY_ON="${PRESENCE_PENALTY_ON:-0.0}" +TEMPERATURE_OFF="${TEMPERATURE_OFF:-0.7}"; TOP_P_OFF="${TOP_P_OFF:-0.8}"; TOP_K_OFF="${TOP_K_OFF:-20}"; PRESENCE_PENALTY_OFF="${PRESENCE_PENALTY_OFF:-1.5}" +# Optional sampling seed for run-to-run variance checks. Unset -> vLLM default +# (deterministic seed=0); set to different values to measure temperature>0 variance. +SEED="${SEED:-}" +# Optional: also save per-request completions (--save-detailed) to eyeball that +# thinking_on actually emits reasoning and thinking_off does not. Off by +# default (bloats the result JSON with all completions). Set SAVE_DETAILED=1. +SAVE_DETAILED="${SAVE_DETAILED:-}" +# Qwen thinking toggles via the enable_thinking chat_template key. +# Use separate single-quoted defaults: an inline ${VAR:-{...}} default whose value +# contains "}" is truncated by bash brace parsing (matches upstream fix #1695). +DEFAULT_CHAT_TEMPLATE_KWARGS_ON='{"enable_thinking": true}' +DEFAULT_CHAT_TEMPLATE_KWARGS_OFF='{"enable_thinking": false}' +CHAT_TEMPLATE_KWARGS_ON="${CHAT_TEMPLATE_KWARGS_ON:-$DEFAULT_CHAT_TEMPLATE_KWARGS_ON}" +CHAT_TEMPLATE_KWARGS_OFF="${CHAT_TEMPLATE_KWARGS_OFF:-$DEFAULT_CHAT_TEMPLATE_KWARGS_OFF}" + +SPEEDBENCH_DIR="${SPEEDBENCH_DIR:-/workspace/speed_bench_data}" +# Flat results dir to match the speedbench-al.yml artifact glob +# (speedbench_results/server_*.log) and its pre-run `rm -rf speedbench_results`. +RESULTS_DIR="${RESULTS_DIR:-/workspace/speedbench_results}" +OUT_YAML="${OUT_YAML:-$RESULTS_DIR/speedbench-reference-al.yaml}" + +export VLLM_ENGINE_READY_TIMEOUT_S=3600 + +mkdir -p "$RESULTS_DIR" +nvidia-smi +if [[ "$SERVE_MODEL" != /* ]]; then hf download "$SERVE_MODEL"; fi + +# ---- Download SPEED-Bench dataset ---- +echo "=== Downloading SPEED-Bench dataset ===" +pip install -q datasets tiktoken +curl -LsSf https://raw.githubusercontent.com/NVIDIA-NeMo/Skills/refs/heads/main/nemo_skills/dataset/speed-bench/prepare.py \ + | python3 - --config qualitative --output_dir "$SPEEDBENCH_DIR" + +if [[ ! -f "$SPEEDBENCH_DIR/qualitative.jsonl" ]]; then + echo "CRITICAL: SPEED-Bench download failed — $SPEEDBENCH_DIR/qualitative.jsonl not found" + exit 1 +fi + +# ---- Temporary shim: add a real --chat-template-kwargs CLI option ---- +# Upstream gap (until vllm-project/vllm#44244 lands): speed_bench/CustomDataset +# pre-renders the chat template client-side WITHOUT chat_template_kwargs and +# posts to /v1/completions, so thinking mode cannot be enabled via --extra-body +# or --default-chat-template-kwargs. This wires a proper --chat-template-kwargs +# option through get_samples into CustomDataset.sample's apply_chat_template. +# Model agnostic (forwards whatever dict it is given). TODO: delete once #44244 +# is released in the benchmark image; idempotent (marker check), safe to leave. +apply_chat_template_kwargs_shim() { + echo "=== Patching vLLM benchmark to add --chat-template-kwargs (temporary shim) ===" + python3 - <<'PYEOF' +import vllm.benchmarks.serve as S +import vllm.benchmarks.datasets.datasets as D + +def patch(mod, edits, marker): + f = mod.__file__ + src = open(f).read() + if marker in src: + print("already patched:", f) + return + for old, new in edits: + n = src.count(old) + assert n == 1, f"anchor matched {n} times in {f}, aborting:\n{old[:80]}..." + src = src.replace(old, new, 1) + open(f, "w").write(src) + print("patched OK ->", f) + +# Edit 1: serve.py -- declare the --chat-template-kwargs argument before --extra-body +serve_old = ''' parser.add_argument( + "--extra-body",''' +serve_new = ''' parser.add_argument( + "--chat-template-kwargs", + type=json.loads, + default=None, + help="JSON dict forwarded to apply_chat_template during " + "client-side prompt rendering, e.g. to enable reasoning mode.", + ) + parser.add_argument( + "--extra-body",''' +patch(S, [(serve_old, serve_new)], marker='"--chat-template-kwargs"') + +# Edit 2: datasets.py -- forward args.chat_template_kwargs into the speed_bench .sample() call +disp_old = ''' output_len=args.speed_bench_output_len, + enable_multimodal_chat=args.enable_multimodal_chat,''' +disp_new = ''' output_len=args.speed_bench_output_len, + chat_template_kwargs=args.chat_template_kwargs, + enable_multimodal_chat=args.enable_multimodal_chat,''' + +# Edit 3: datasets.py -- forward chat_template_kwargs into CustomDataset.sample's template call +samp_old = ''' # apply template + if not skip_chat_template: + prompt = tokenizer.apply_chat_template( + [{"role": "user", "content": prompt}], + add_generation_prompt=True, + tokenize=False, + ) + + prompt_len = len(tokenizer(prompt).input_ids)''' +samp_new = ''' # apply template + if not skip_chat_template: + _ctk = kwargs.get("chat_template_kwargs") or {} + prompt = tokenizer.apply_chat_template( + [{"role": "user", "content": prompt}], + add_generation_prompt=True, + tokenize=False, + **_ctk, + ) + + prompt_len = len(tokenizer(prompt).input_ids)''' +patch(D, [(disp_old, disp_new), (samp_old, samp_new)], + marker="chat_template_kwargs=args.chat_template_kwargs") +PYEOF +} + +# Apply the shim once if any cell will pass chat_template_kwargs. +NEED_SHIM=0 +if [[ " $THINKING_MODES " == *" on "* && -n "$CHAT_TEMPLATE_KWARGS_ON" ]]; then NEED_SHIM=1; fi +if [[ " $THINKING_MODES " == *" off "* && -n "$CHAT_TEMPLATE_KWARGS_OFF" ]]; then NEED_SHIM=1; fi +if [[ "$NEED_SHIM" == "1" ]]; then + if ! apply_chat_template_kwargs_shim; then + echo "CRITICAL: --chat-template-kwargs shim failed — aborting" + exit 1 + fi +fi + +PARALLEL_ARGS=(--tensor-parallel-size "$TP" --data-parallel-size 1) +if [ "${DP_ATTENTION}" = "true" ]; then + PARALLEL_ARGS=(--tensor-parallel-size 1 --data-parallel-size "$TP") +fi +EP_ARGS=() +if [ "${EP_SIZE:-1}" -gt 1 ]; then + EP_ARGS=(--enable-expert-parallel) +fi + +fetch_metric() { + local port="$1" name="$2" + curl -s "http://localhost:${port}/metrics" \ + | grep -oP "${name}\\{[^}]*\\} \\K[0-9.]+" || echo "0" +} + +SERVER_PID="" +_descendants() { + local pid="$1" child + for child in $(pgrep -P "$pid" 2>/dev/null || true); do + echo "$child" + _descendants "$child" + done +} +cleanup_server() { + if [[ -n "$SERVER_PID" ]]; then + local descendants + descendants=$(_descendants "$SERVER_PID") + kill "$SERVER_PID" 2>/dev/null || true + wait "$SERVER_PID" 2>/dev/null || true + local pid + for pid in $descendants; do + kill -9 "$pid" 2>/dev/null || true + done + local waited=0 + while [[ $waited -lt 120 ]]; do + local used + used=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits 2>/dev/null | sort -rn | head -1) + if [[ -z "$used" || "$used" -lt 2000 ]]; then break; fi + sleep 3; waited=$((waited + 3)) + done + SERVER_PID="" + fi +} +trap 'cleanup_server' EXIT + +start_gpu_monitor + +declare -A AL_RESULT + +run_cell() { + local mode="$1" mtp="$2" + local think_args=() + local temp top_p top_k pp + if [[ "$mode" == "on" ]]; then + [[ -n "$CHAT_TEMPLATE_KWARGS_ON" ]] && think_args=(--chat-template-kwargs "$CHAT_TEMPLATE_KWARGS_ON") + temp="$TEMPERATURE_ON"; top_p="$TOP_P_ON"; top_k="$TOP_K_ON"; pp="$PRESENCE_PENALTY_ON" + else + [[ -n "$CHAT_TEMPLATE_KWARGS_OFF" ]] && think_args=(--chat-template-kwargs "$CHAT_TEMPLATE_KWARGS_OFF") + temp="$TEMPERATURE_OFF"; top_p="$TOP_P_OFF"; top_k="$TOP_K_OFF"; pp="$PRESENCE_PENALTY_OFF" + fi + local seed_args=() + [[ -n "$SEED" ]] && seed_args=(--seed "$SEED") + local detail_args=() + [[ -n "$SAVE_DETAILED" ]] && detail_args=(--save-detailed) + + echo "" + echo "==========================================" + echo " Cell: thinking=$mode MTP=$mtp category=$CATEGORY" + echo "==========================================" + + local serve_args=( + --host 0.0.0.0 --port "$PORT" + "${PARALLEL_ARGS[@]}" + --pipeline-parallel-size 1 + --kv-cache-dtype fp8 + --trust-remote-code + --no-enable-prefix-caching + "${EP_ARGS[@]}" + --reasoning-parser qwen3 + --tool-call-parser qwen3_coder + --enable-auto-tool-choice + --language-model-only + --max-cudagraph-capture-size 512 + --max-model-len 16384 + --speculative-config "{\"method\": \"mtp\", \"num_speculative_tokens\": $mtp}" + ) + + local server_log="$RESULTS_DIR/server_${mode}_mtp${mtp}.log" + vllm serve "$SERVE_MODEL" "${serve_args[@]}" > "$server_log" 2>&1 & + SERVER_PID=$! + + if ! wait_for_server_ready --port "$PORT" --server-log "$server_log" --server-pid "$SERVER_PID"; then + echo " -> server failed to start (thinking=$mode mtp=$mtp), recording N/A" + AL_RESULT["${mode}_${mtp}"]="N/A" + cleanup_server + return + fi + + local acc_before drf_before acc_after drf_after + acc_before=$(fetch_metric "$PORT" "vllm:spec_decode_num_accepted_tokens_total") + drf_before=$(fetch_metric "$PORT" "vllm:spec_decode_num_drafts_total") + + vllm bench serve \ + --model "$SERVE_MODEL" \ + --port "$PORT" \ + --dataset-name speed_bench \ + --dataset-path "$SPEEDBENCH_DIR" \ + --speed-bench-category "$CATEGORY" \ + --speed-bench-output-len "$SPEEDBENCH_OUTPUT_LEN" \ + --num-prompts -1 \ + --max-concurrency "$CONCURRENCY" \ + --save-result \ + --result-dir "$RESULTS_DIR" \ + --result-filename "speedbench_${mode}_mtp${mtp}" \ + --trust-remote-code \ + --temperature "$temp" \ + --top-p "$top_p" \ + --top-k "$top_k" \ + --presence-penalty "$pp" \ + "${seed_args[@]}" \ + "${detail_args[@]}" \ + "${think_args[@]}" + + acc_after=$(fetch_metric "$PORT" "vllm:spec_decode_num_accepted_tokens_total") + drf_after=$(fetch_metric "$PORT" "vllm:spec_decode_num_drafts_total") + + local delta_acc delta_drf al + delta_acc=$(awk "BEGIN {printf \"%d\", $acc_after - $acc_before}") + delta_drf=$(awk "BEGIN {printf \"%d\", $drf_after - $drf_before}") + if [[ "$delta_drf" -gt 0 ]]; then + al=$(awk "BEGIN {printf \"%.2f\", 1 + ($delta_acc / $delta_drf)}") + else + al="N/A" + fi + echo " -> thinking=$mode MTP=$mtp AL=$al (accepted=$delta_acc drafts=$delta_drf)" + AL_RESULT["${mode}_${mtp}"]="$al" + + cleanup_server +} + +for mode in $THINKING_MODES; do + for mtp in $MTP_LIST; do + run_cell "$mode" "$mtp" + done +done + +stop_gpu_monitor + +# ---- Emit the YAML matrix ---- +emit_mode_block() { + local mode="$1" + for mtp in $MTP_LIST; do + echo " $mtp: ${AL_RESULT[${mode}_${mtp}]:-N/A}" + done +} + +{ + echo "# Acceptance Length (AL) reference values measured with SPEED-Bench." + echo "# dataset: $CATEGORY | output_len: $SPEEDBENCH_OUTPUT_LEN" + echo "# thinking_on : temp $TEMPERATURE_ON top_p $TOP_P_ON top_k $TOP_K_ON presence_penalty $PRESENCE_PENALTY_ON | chat_template_kwargs: $CHAT_TEMPLATE_KWARGS_ON" + echo "# thinking_off: temp $TEMPERATURE_OFF top_p $TOP_P_OFF top_k $TOP_K_OFF presence_penalty $PRESENCE_PENALTY_OFF | chat_template_kwargs: $CHAT_TEMPLATE_KWARGS_OFF" + echo "# Measured on $MODEL_KEY (B300, vLLM MTP), per num_speculative_tokens." + echo "# Auto-generated by benchmarks/single_node/speedbench/qwen3.5_fp4_b300_vllm.sh (speedbench-al.yml)." + echo "#" + echo "# key = num_speculative_tokens (MTP level); value = golden AL" + echo "${MODEL_KEY}:" + if [[ " $THINKING_MODES " == *" on "* ]]; then + echo " thinking_on:" + emit_mode_block on + fi + if [[ " $THINKING_MODES " == *" off "* ]]; then + echo " thinking_off:" + emit_mode_block off + fi +} > "$OUT_YAML" + +echo "" +echo "==========================================" +echo " SPEED-Bench AL matrix written to: $OUT_YAML" +echo "==========================================" +cat "$OUT_YAML" From ec187d02b3928b9f0dcad76ec51ec9f5c9a176bc Mon Sep 17 00:00:00 2001 From: root Date: Thu, 11 Jun 2026 11:31:07 -0700 Subject: [PATCH 2/2] [NV] Log per-request benchmark output for AL auditability Add `--save-detailed` to the SpeedBench AL collectors (dsv4/dsr1/glm5/ qwen3.5) so vllm bench serve records each request's response text, and upload speedbench_results/speedbench_* as a workflow artifact. This lets each cell's measured AL be audited for output correctness (sensible text + correct thinking mode), per review feedback. No behavior change: the flag only adds fields to the already-saved result JSON. --- .github/workflows/speedbench-al.yml | 11 +++++++++++ .../single_node/speedbench/dsr1_fp4_b300_vllm.sh | 1 + .../single_node/speedbench/dsv4_fp4_b300_vllm.sh | 1 + .../single_node/speedbench/glm5_fp4_b300_vllm.sh | 1 + .../single_node/speedbench/qwen3.5_fp4_b300_vllm.sh | 1 + 5 files changed, 15 insertions(+) diff --git a/.github/workflows/speedbench-al.yml b/.github/workflows/speedbench-al.yml index 7c24660c2..84a92e57a 100644 --- a/.github/workflows/speedbench-al.yml +++ b/.github/workflows/speedbench-al.yml @@ -209,6 +209,17 @@ jobs: path: speedbench_results/server_*.log if-no-files-found: ignore + # Per-request benchmark detail (vllm bench serve --save-detailed): includes + # the model's response text per request, so the AL of each cell can be + # audited for output correctness (sensible text + correct thinking mode). + - name: Upload detailed benchmark results + if: always() + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 + with: + name: speedbench_detailed_results-${{ inputs.model-prefix }} + path: speedbench_results/speedbench_* + if-no-files-found: ignore + - name: Resource cleanup (post-run) if: always() run: *resource-cleanup \ No newline at end of file diff --git a/benchmarks/single_node/speedbench/dsr1_fp4_b300_vllm.sh b/benchmarks/single_node/speedbench/dsr1_fp4_b300_vllm.sh index aee0bb967..d0357c6b4 100755 --- a/benchmarks/single_node/speedbench/dsr1_fp4_b300_vllm.sh +++ b/benchmarks/single_node/speedbench/dsr1_fp4_b300_vllm.sh @@ -177,6 +177,7 @@ run_cell() { --num-prompts -1 \ --max-concurrency "$CONCURRENCY" \ --save-result \ + --save-detailed \ --result-dir "$RESULTS_DIR" \ --result-filename "speedbench_${mode}_mtp${mtp}" \ --trust-remote-code \ diff --git a/benchmarks/single_node/speedbench/dsv4_fp4_b300_vllm.sh b/benchmarks/single_node/speedbench/dsv4_fp4_b300_vllm.sh index 2a77dcb36..b8550a350 100755 --- a/benchmarks/single_node/speedbench/dsv4_fp4_b300_vllm.sh +++ b/benchmarks/single_node/speedbench/dsv4_fp4_b300_vllm.sh @@ -275,6 +275,7 @@ run_cell() { --num-prompts -1 \ --max-concurrency "$CONCURRENCY" \ --save-result \ + --save-detailed \ --result-dir "$RESULTS_DIR" \ --result-filename "speedbench_${mode}_mtp${mtp}" \ --trust-remote-code \ diff --git a/benchmarks/single_node/speedbench/glm5_fp4_b300_vllm.sh b/benchmarks/single_node/speedbench/glm5_fp4_b300_vllm.sh index 4fb0d114b..6265500b9 100755 --- a/benchmarks/single_node/speedbench/glm5_fp4_b300_vllm.sh +++ b/benchmarks/single_node/speedbench/glm5_fp4_b300_vllm.sh @@ -292,6 +292,7 @@ run_cell() { --num-prompts -1 \ --max-concurrency "$CONCURRENCY" \ --save-result \ + --save-detailed \ --result-dir "$RESULTS_DIR" \ --result-filename "speedbench_${mode}_mtp${mtp}" \ --trust-remote-code \ diff --git a/benchmarks/single_node/speedbench/qwen3.5_fp4_b300_vllm.sh b/benchmarks/single_node/speedbench/qwen3.5_fp4_b300_vllm.sh index 4935337a1..bf2bda7c8 100755 --- a/benchmarks/single_node/speedbench/qwen3.5_fp4_b300_vllm.sh +++ b/benchmarks/single_node/speedbench/qwen3.5_fp4_b300_vllm.sh @@ -294,6 +294,7 @@ run_cell() { --num-prompts -1 \ --max-concurrency "$CONCURRENCY" \ --save-result \ + --save-detailed \ --result-dir "$RESULTS_DIR" \ --result-filename "speedbench_${mode}_mtp${mtp}" \ --trust-remote-code \