Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11650,3 +11650,29 @@ minimaxm2.5-fp8-gb300-dynamo-vllm:
tp: 4
ep: 4
dp-attn: true

# Single-node TensorRT-LLM benchmark entry for the NVFP4 Qwen3.5 checkpoint
# on the b200 runner. Each search-space row is one parallelism layout
# (tp / ep, optionally dp-attn) plus the concurrencies to run for it.
# NOTE(review): these rows use `conc-list`; confirm the sweep generator
# accepts it for fixed-seq-len scenarios (vs. conc-start / conc-end).
qwen3.5-fp4-b200-trt:
  image: nvcr.io#nvidia/tensorrt-llm/release:1.3.0rc18
  model: nvidia/Qwen3.5-397B-A17B-NVFP4
  model-prefix: qwen3.5
  runner: b200
  precision: fp4
  framework: trt
  multinode: false
  scenarios:
    fixed-seq-len:
      # 1k input / 1k output
      - isl: 1024
        osl: 1024
        search-space:
          - { tp: 4, ep: 1, conc-list: [8, 16] }
          - { tp: 4, ep: 4, conc-list: [64, 128] }
          - { tp: 8, ep: 8, conc-list: [4, 64] }
          - { tp: 8, ep: 8, dp-attn: true, conc-list: [512, 1024] }
      # 8k input / 1k output
      - isl: 8192
        osl: 1024
        search-space:
          - { tp: 2, ep: 1, conc-list: [4, 16] }
          - { tp: 4, ep: 1, conc-list: [4] }
          - { tp: 2, ep: 2, conc-list: [8, 32] }
          - { tp: 8, ep: 8, conc-list: [4] }
          - { tp: 8, ep: 8, dp-attn: true, conc-list: [256, 512, 1024] }

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

conc-list breaks full-sweep

High Severity

The new qwen3.5-fp4-b200-trt fixed-seq-len search space uses conc-list, but single-node full-sweep generation in utils/matrix_logic/generate_sweep_configs.py only reads conc-start and conc-end for that scenario. Matrix generation will raise a missing-key error and no benchmark jobs will be emitted for this config.

Fix in Cursor Fix in Web

Reviewed by Cursor Bugbot for commit 57c34e4. Configure here.

123 changes: 123 additions & 0 deletions benchmarks/single_node/fixed_seq_len/qwen3.5_fp4_b200_trt.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
#!/usr/bin/env bash
#
# Single-node fixed-sequence-length benchmark: launches trtllm-serve for
# "$MODEL" (PyTorch backend), waits for it to become ready, then drives it
# with run_benchmark_serving and, optionally, an lm-eval run.
#
# Required environment:
#   MODEL, PORT, TP, EP_SIZE, DP_ATTENTION, CONC, ISL, OSL, MAX_MODEL_LEN,
#   RANDOM_RANGE_RATIO, RESULT_FILENAME
# Optional environment:
#   SLURM_JOB_ID / SLURMD_NODENAME  - only logged
#   EVAL_ONLY=true                  - size the server from EVAL_MAX_MODEL_LEN
#   RUN_EVAL=true                   - run lm-eval after the throughput run
#
# The helpers used below (check_env_vars, setup_eval_context,
# start_gpu_monitor, wait_for_server_ready, run_benchmark_serving, run_eval,
# append_lm_eval_summary, stop_gpu_monitor) are not defined in this script;
# they are expected to be provided by benchmark_lib.sh.

source "$(dirname "$0")/../../benchmark_lib.sh"

# PORT is consumed by the server, the readiness probe, the benchmark client
# and the eval step, so validate it up front together with the other inputs.
check_env_vars \
  MODEL \
  PORT \
  TP \
  CONC \
  ISL \
  OSL \
  MAX_MODEL_LEN \
  RANDOM_RANGE_RATIO \
  RESULT_FILENAME \
  DP_ATTENTION \
  EP_SIZE

if [[ -n "${SLURM_JOB_ID:-}" ]]; then
  echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
fi

echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION"

# A MODEL that is not an absolute path is fetched with the hf CLI first.
if [[ "$MODEL" != /* ]]; then
  hf download "$MODEL"
fi

nvidia-smi

SERVER_LOG=/workspace/server.log
EXTRA_CONFIG_FILE="qwen3.5-fp4-trt.yml"

# Never size the server for fewer than 16 concurrent sequences.
MAX_BATCH_SIZE=$(( CONC > 16 ? CONC : 16 ))

# MODE_CONFIG is appended verbatim to the generated YAML, so the indentation
# inside these strings is significant: nested keys must stay indented under
# their parent, top-level keys must start at column 0.
if [[ "$DP_ATTENTION" == "true" ]]; then
  MOE_BACKEND="CUTEDSL"
  MODE_CONFIG="attention_dp_config:
  enable_balance: true
  batching_wait_iters: 10
  timeout_iters: 500"
else
  MOE_BACKEND="TRTLLM"
  MODE_CONFIG="batch_wait_timeout_iters: 50
batch_wait_max_tokens_ratio: 0.45"
fi

# Extra LLM API options handed to trtllm-serve. The delimiter is unquoted on
# purpose so the shell variables expand; the sub-keys of each *_config group
# are indented so they are parsed as children of that group.
cat > "$EXTRA_CONFIG_FILE" << EOF
backend: pytorch
print_iter_log: true
enable_layerwise_nvtx_marker: false
disable_overlap_scheduler: false
enable_iter_perf_stats: true
enable_chunked_prefill: false
stream_interval: 20
num_postprocess_workers: 4
enable_attention_dp: $DP_ATTENTION
scheduler_config:
  capacity_scheduler_policy: MAX_UTILIZATION
  context_chunking_policy: FIRST_COME_FIRST_SERVED
kv_cache_config:
  free_gpu_memory_fraction: 0.9
  enable_block_reuse: false
  dtype: fp8
cuda_graph_config:
  enable_padding: true
  max_batch_size: $MAX_BATCH_SIZE
moe_config:
  backend: $MOE_BACKEND
  use_low_precision_moe_combine: true
$MODE_CONFIG
EOF

echo "Generated config file contents:"
cat "$EXTRA_CONFIG_FILE"

# Both limits are floored at 8192; the token budget otherwise covers one
# full request (ISL + OSL) plus 256 tokens of headroom.
MAX_MODEL_LEN=$(( MAX_MODEL_LEN > 8192 ? MAX_MODEL_LEN : 8192 ))
MAX_NUM_TOKENS=$(( ISL + OSL + 256 ))
MAX_NUM_TOKENS=$(( MAX_NUM_TOKENS > 8192 ? MAX_NUM_TOKENS : 8192 ))

if [[ "${EVAL_ONLY:-}" == "true" ]]; then
  # Presumably setup_eval_context exports EVAL_MAX_MODEL_LEN - confirm in
  # benchmark_lib.sh. Eval-only runs take both limits from that value.
  setup_eval_context
  MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
  MAX_NUM_TOKENS="$EVAL_MAX_MODEL_LEN"
fi

# Start GPU monitoring (power, temperature, clocks every second)
start_gpu_monitor

# Trace every command from here on so the exact launch line is in the job log.
set -x
mpirun -n 1 --oversubscribe --allow-run-as-root \
  trtllm-serve "$MODEL" --port="$PORT" \
  --trust_remote_code \
  --backend=pytorch \
  --max_batch_size "$MAX_BATCH_SIZE" \
  --max_seq_len="$MAX_MODEL_LEN" \
  --max_num_tokens="$MAX_NUM_TOKENS" \
  --tp_size="$TP" --ep_size="$EP_SIZE" \
  --extra_llm_api_options="$EXTRA_CONFIG_FILE" \
  > "$SERVER_LOG" 2>&1 &

SERVER_PID=$!

# Wait for server to be ready
wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"

# Throughput run: ten prompts per concurrency slot.
run_benchmark_serving \
  --model "$MODEL" \
  --port "$PORT" \
  --backend openai \
  --input-len "$ISL" \
  --output-len "$OSL" \
  --random-range-ratio "$RANDOM_RANGE_RATIO" \
  --num-prompts "$(( CONC * 10 ))" \
  --max-concurrency "$CONC" \
  --result-filename "$RESULT_FILENAME" \
  --result-dir /workspace/

# After throughput, run evaluation only if RUN_EVAL is true
if [[ "${RUN_EVAL:-}" == "true" ]]; then
  run_eval --framework lm-eval --port "$PORT"
  append_lm_eval_summary
fi

# Stop GPU monitoring
stop_gpu_monitor
set +x
6 changes: 6 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3609,3 +3609,9 @@
- "8 topologies sweeping low-latency (1p1d-tp8-tp8) through max throughput (6p1d-dep8-dep12)."
- "Updated 1p1d and 2p1d configs to match https://github.com/shyeh25/srt-slurm/commit/ede724d7cc9a780be5b84659f599733bf9fd0097"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1675

# Changelog entry for the qwen3.5-fp4-b200-trt benchmark config.
- config-keys:
    - qwen3.5-fp4-b200-trt
  description:
    - "Add Qwen3.5-397B-A17B-NVFP4 B200 single-node TensorRT-LLM benchmark (1k/1k and 8k/1k) with a TP/TEP/DEP parallelism sweep"
  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1711