Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions .github/workflows/speedbench-al.yml
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,17 @@ jobs:
path: speedbench_results/server_*.log
if-no-files-found: ignore

# Per-request benchmark detail (vllm bench serve --save-detailed): includes
# the model's response text per request, so the AL of each cell can be
# audited for output correctness (sensible text + correct thinking mode).
- name: Upload detailed benchmark results
if: always()
uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
with:
name: speedbench_detailed_results-${{ inputs.model-prefix }}
path: speedbench_results/speedbench_*
if-no-files-found: ignore

- name: Resource cleanup (post-run)
if: always()
run: *resource-cleanup
243 changes: 243 additions & 0 deletions benchmarks/single_node/speedbench/dsr1_fp4_b300_vllm.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,243 @@
#!/usr/bin/env bash

# DeepSeek-R1 B300 vLLM SPEED-Bench AL matrix collector.
#
# Produces the golden acceptance-length (AL) reference matrix consumed by the
# synthetic-acceptance framework: for each MTP level (num_speculative_tokens),
# measure the REAL AL on a single SPEED-Bench category (default: coding) and emit
# a YAML matrix identical in shape to benchmarks/speedbench-reference-al.yaml.
# This measures real MTP acceptance; the synthetic value is injected downstream
# by the throughput recipe, not here.
#
# Adapted from speedbench/dsv4_fp4_b300_vllm.sh. DeepSeek-R1 is DeepSeek-V3
# architecture (MLA dense attention), NOT V4 (DSA / Lightning Indexer), so vs the
# DSV4 collector:
# - NO --tokenizer-mode deepseek_v4 / --reasoning-parser deepseek_v4 /
# --tool-call-parser deepseek_v4 (all V4-specific; the official vLLM R1
# serve command is bare). reasoning-parser is irrelevant here anyway: AL is
# read from /metrics, not from parsed output.
# - NO --attention_config.use_fp4_indexer_cache (that knob is dsv32/MLA-indexer
# only; R1 is plain MLA and never reads it).
# - NO --block-size / --compilation-config (the official R1 recipe omits them;
# defaults apply). --kv-cache-dtype fp8 IS kept, to match the dsv4/qwen/glm
# collectors so all golden AL values share one kv-cache numeric regime.
# - FP4 on Blackwell needs FlashInfer MoE: export VLLM_USE_FLASHINFER_MOE_FP4=1.
# - THINKING: R1 is a pure reasoning model and always emits <think> (its chat
# template has no enable_thinking toggle). There is no thinking-off mode, so
# this collector measures thinking_on only and needs no --chat-template-kwargs
# shim (the default client-side template render already enables thinking).
#
# Checkpoint (B300 / Blackwell): NVFP4 build nvidia/DeepSeek-R1-0528-NVFP4-v2,
# basename dsr1-fp4 on the runner (resolved by launch_b300-nv.sh).
#
# Usage (inside the vLLM container, on a B300 node):
# export MODEL=/data/models/dsr1-fp4
# bash benchmarks/single_node/speedbench/dsr1_fp4_b300_vllm.sh
#
# Tunables (env):
# MTP_LIST space-separated MTP levels (default "1 2 3 4 5 6 7 8")
# THINKING_MODES space-separated: on (default "on"; R1 has no off)
# CATEGORY SPEED-Bench category (default coding)
# SPEEDBENCH_OUTPUT_LEN per-request output len (default 4096)
# OUT_YAML output matrix path (default $RESULTS_DIR/speedbench-reference-al.yaml)

set -uo pipefail
source "$(dirname "$0")/../../benchmark_lib.sh"

MODEL="${MODEL:?MODEL env var required (e.g. /data/models/dsr1-fp4)}"
SERVE_MODEL="${MODEL_PATH:-$MODEL}"
TP="${TP:-8}"
DP_ATTENTION="${DP_ATTENTION:-false}"
PORT="${PORT:-8888}"

MTP_LIST="${MTP_LIST:-1 2 3 4 5 6 7 8}"
THINKING_MODES="${THINKING_MODES:-on}"
CATEGORY="${CATEGORY:-coding}"
MODEL_KEY="${MODEL_KEY:-$(basename "$SERVE_MODEL" | tr '[:upper:]' '[:lower:]')}"
SPEEDBENCH_OUTPUT_LEN="${SPEEDBENCH_OUTPUT_LEN:-4096}"
CONCURRENCY="${CONCURRENCY:-1}"
# Provider-recommended sampling from the DeepSeek-R1 checkpoint generation_config
# (temperature 0.6, top_p 0.95; no top_k). vLLM's own default top_p is 1.0, so it
# MUST be passed explicitly or the measured AL is taken at the wrong settings.
TEMPERATURE="${TEMPERATURE:-0.6}"
TOP_P="${TOP_P:-0.95}"

SPEEDBENCH_DIR="${SPEEDBENCH_DIR:-/workspace/speed_bench_data}"
# Flat results dir to match the speedbench-al.yml artifact glob
# (speedbench_results/server_*.log) and its pre-run `rm -rf speedbench_results`.
RESULTS_DIR="${RESULTS_DIR:-/workspace/speedbench_results}"
OUT_YAML="${OUT_YAML:-$RESULTS_DIR/speedbench-reference-al.yaml}"

# Blackwell FP4 MoE path (DeepSeek-R1 FP4 on B-series): required per vLLM R1 docs.
export VLLM_USE_FLASHINFER_MOE_FP4="${VLLM_USE_FLASHINFER_MOE_FP4:-1}"
export VLLM_ENGINE_READY_TIMEOUT_S=3600

mkdir -p "$RESULTS_DIR"
nvidia-smi
if [[ "$SERVE_MODEL" != /* ]]; then hf download "$SERVE_MODEL"; fi

# ---- Download SPEED-Bench dataset ----
echo "=== Downloading SPEED-Bench dataset ==="
pip install -q datasets tiktoken
curl -LsSf https://raw.githubusercontent.com/NVIDIA-NeMo/Skills/refs/heads/main/nemo_skills/dataset/speed-bench/prepare.py \
| python3 - --config qualitative --output_dir "$SPEEDBENCH_DIR"

if [[ ! -f "$SPEEDBENCH_DIR/qualitative.jsonl" ]]; then
echo "CRITICAL: SPEED-Bench download failed — $SPEEDBENCH_DIR/qualitative.jsonl not found"
exit 1
fi

PARALLEL_ARGS=(--tensor-parallel-size "$TP" --data-parallel-size 1)
if [ "${DP_ATTENTION}" = "true" ]; then
PARALLEL_ARGS=(--tensor-parallel-size 1 --data-parallel-size "$TP")
fi

fetch_metric() {
local port="$1" name="$2"
curl -s "http://localhost:${port}/metrics" \
| grep -oP "${name}\\{[^}]*\\} \\K[0-9.]+" || echo "0"
}

SERVER_PID=""
_descendants() {
local pid="$1" child
for child in $(pgrep -P "$pid" 2>/dev/null || true); do
echo "$child"
_descendants "$child"
done
}
cleanup_server() {
if [[ -n "$SERVER_PID" ]]; then
local descendants
descendants=$(_descendants "$SERVER_PID")
kill "$SERVER_PID" 2>/dev/null || true
wait "$SERVER_PID" 2>/dev/null || true
local pid
for pid in $descendants; do
kill -9 "$pid" 2>/dev/null || true
done
local waited=0
while [[ $waited -lt 120 ]]; do
local used
used=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits 2>/dev/null | sort -rn | head -1)
if [[ -z "$used" || "$used" -lt 2000 ]]; then break; fi
sleep 3; waited=$((waited + 3))
done
SERVER_PID=""
fi
}
trap 'cleanup_server' EXIT

start_gpu_monitor

declare -A AL_RESULT

run_cell() {
local mode="$1" mtp="$2"

echo ""
echo "=========================================="
echo " Cell: thinking=$mode MTP=$mtp category=$CATEGORY"
echo "=========================================="

local serve_args=(
--host 0.0.0.0 --port "$PORT"
"${PARALLEL_ARGS[@]}"
--pipeline-parallel-size 1
--trust-remote-code
--enable-expert-parallel
--kv-cache-dtype fp8
--no-enable-prefix-caching
--max-model-len 16384
--speculative-config "{\"method\": \"mtp\", \"num_speculative_tokens\": $mtp}"
)

local server_log="$RESULTS_DIR/server_${mode}_mtp${mtp}.log"
vllm serve "$SERVE_MODEL" "${serve_args[@]}" > "$server_log" 2>&1 &
SERVER_PID=$!

if ! wait_for_server_ready --port "$PORT" --server-log "$server_log" --server-pid "$SERVER_PID"; then
echo " -> server failed to start (thinking=$mode mtp=$mtp), recording N/A"
AL_RESULT["${mode}_${mtp}"]="N/A"
cleanup_server
return
fi

local acc_before drf_before acc_after drf_after
acc_before=$(fetch_metric "$PORT" "vllm:spec_decode_num_accepted_tokens_total")
drf_before=$(fetch_metric "$PORT" "vllm:spec_decode_num_drafts_total")

vllm bench serve \
--model "$SERVE_MODEL" \
--port "$PORT" \
--dataset-name speed_bench \
--dataset-path "$SPEEDBENCH_DIR" \
--speed-bench-category "$CATEGORY" \
--speed-bench-output-len "$SPEEDBENCH_OUTPUT_LEN" \
--num-prompts -1 \
--max-concurrency "$CONCURRENCY" \
--save-result \
--save-detailed \
--result-dir "$RESULTS_DIR" \
--result-filename "speedbench_${mode}_mtp${mtp}" \
--trust-remote-code \
--temperature "$TEMPERATURE" \
--top-p "$TOP_P"

acc_after=$(fetch_metric "$PORT" "vllm:spec_decode_num_accepted_tokens_total")
drf_after=$(fetch_metric "$PORT" "vllm:spec_decode_num_drafts_total")

local delta_acc delta_drf al
delta_acc=$(awk "BEGIN {printf \"%d\", $acc_after - $acc_before}")
delta_drf=$(awk "BEGIN {printf \"%d\", $drf_after - $drf_before}")
if [[ "$delta_drf" -gt 0 ]]; then
al=$(awk "BEGIN {printf \"%.2f\", 1 + ($delta_acc / $delta_drf)}")
else
al="N/A"
fi
echo " -> thinking=$mode MTP=$mtp AL=$al (accepted=$delta_acc drafts=$delta_drf)"
AL_RESULT["${mode}_${mtp}"]="$al"

cleanup_server
}

for mode in $THINKING_MODES; do
for mtp in $MTP_LIST; do
run_cell "$mode" "$mtp"
done
done

stop_gpu_monitor

# ---- Emit the YAML matrix ----
emit_mode_block() {
local mode="$1"
for mtp in $MTP_LIST; do
echo " $mtp: ${AL_RESULT[${mode}_${mtp}]:-N/A}"
done
}

{
echo "# Acceptance Length (AL) reference values measured with SPEED-Bench."
echo "# dataset: $CATEGORY | temperature: $TEMPERATURE | top_p: $TOP_P | output_len: $SPEEDBENCH_OUTPUT_LEN"
echo "# DeepSeek-R1 always reasons (no thinking-off mode), so only thinking_on is emitted."
echo "# Measured on $MODEL_KEY (B300, vLLM MTP), per num_speculative_tokens."
echo "# Auto-generated by benchmarks/single_node/speedbench/dsr1_fp4_b300_vllm.sh (speedbench-al.yml)."
echo "#"
echo "# key = num_speculative_tokens (MTP level); value = golden AL"
echo "${MODEL_KEY}:"
if [[ " $THINKING_MODES " == *" on "* ]]; then
echo " thinking_on:"
emit_mode_block on
fi
if [[ " $THINKING_MODES " == *" off "* ]]; then
echo " thinking_off:"
emit_mode_block off
fi
} > "$OUT_YAML"

echo ""
echo "=========================================="
echo " SPEED-Bench AL matrix written to: $OUT_YAML"
echo "=========================================="
cat "$OUT_YAML"
1 change: 1 addition & 0 deletions benchmarks/single_node/speedbench/dsv4_fp4_b300_vllm.sh
Original file line number Diff line number Diff line change
Expand Up @@ -275,6 +275,7 @@ run_cell() {
--num-prompts -1 \
--max-concurrency "$CONCURRENCY" \
--save-result \
--save-detailed \
--result-dir "$RESULTS_DIR" \
--result-filename "speedbench_${mode}_mtp${mtp}" \
--trust-remote-code \
Expand Down
Loading
Loading