# Reviewer notes for this patch. These "#" lines are placed before the first
# `diff --git` header, i.e. outside every hunk.
# NOTE(review): patch tools are expected to skip leading non-diff text like
# this -- confirm with the tooling you use before applying.
# NOTE(review): in this copy the patch's line breaks and indentation appear to
# be collapsed into single spaces. Indentation is significant in the YAML
# hunks and in the heredoc / MODE_CONFIG strings of the script, so the layout
# has to be restored from the original patch before this can be applied.
#
# Purpose: add a Qwen3.5 NVFP4 TensorRT-LLM benchmark for the b200 runner.
#
# 1. .github/configs/nvidia-master.yaml
#    Appends the `qwen3.5-fp4-b200-trt` entry: model
#    nvidia/Qwen3.5-397B-A17B-NVFP4, model-prefix qwen3.5, runner b200,
#    precision fp4, framework trt, multinode false. Two fixed-seq-len
#    scenarios are declared (isl/osl 1024/1024 and 8192/1024), each with its
#    own search-space of tp / ep / optional dp-attn combinations and an
#    explicit conc-list per combination.
#    NOTE(review): the image is written `nvcr.io#nvidia/...` with "#" after the
#    registry host -- presumably the registry#image form the runner expects;
#    verify against the neighbouring entries in the full file.
#
# 2. benchmarks/single_node/fixed_seq_len/qwen3.5_fp4_b200_trt.sh (new file)
#    - Sources ../../benchmark_lib.sh relative to the script and calls
#      check_env_vars for MODEL, TP, CONC, ISL, OSL, MAX_MODEL_LEN,
#      RANDOM_RANGE_RATIO, RESULT_FILENAME, DP_ATTENTION and EP_SIZE.
#      NOTE(review): PORT is passed to trtllm-serve, wait_for_server_ready,
#      run_benchmark_serving and run_eval but is not in that list --
#      presumably exported by the caller or by benchmark_lib.sh; confirm.
#    - Runs `hf download "$MODEL"` unless MODEL starts with "/".
#    - MAX_BATCH_SIZE is CONC with a floor of 16.
#    - DP_ATTENTION == "true" selects MOE_BACKEND=CUTEDSL and emits an
#      `attention_dp_config` section; any other value selects
#      MOE_BACKEND=TRTLLM and emits the two `batch_wait_*` keys instead.
#    - Writes the extra LLM API options to qwen3.5-fp4-trt.yml via a heredoc
#      and echoes the generated file.
#    - MAX_MODEL_LEN is floored at 8192; MAX_NUM_TOKENS is ISL + OSL + 256,
#      also floored at 8192. When EVAL_ONLY is "true", setup_eval_context is
#      called and both values are replaced by EVAL_MAX_MODEL_LEN.
#    - Starts the GPU monitor, launches trtllm-serve under `mpirun -n 1` in
#      the background with output redirected to /workspace/server.log, waits
#      for readiness, then runs run_benchmark_serving with CONC * 10 prompts
#      at max concurrency CONC. When RUN_EVAL is "true" it additionally runs
#      run_eval and append_lm_eval_summary, then stops the GPU monitor.
#
# 3. perf-changelog.yaml
#    Appends a changelog entry keyed on `qwen3.5-fp4-b200-trt` with a one-line
#    description and the pr-link for pull request 1711.
diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index d22ae3229..7b25f44b1 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -11678,3 +11678,29 @@ minimaxm2.5-fp8-gb300-dynamo-vllm: tp: 4 ep: 4 dp-attn: true + +qwen3.5-fp4-b200-trt: + image: nvcr.io#nvidia/tensorrt-llm/release:1.3.0rc18 + model: nvidia/Qwen3.5-397B-A17B-NVFP4 + model-prefix: qwen3.5 + runner: b200 + precision: fp4 + framework: trt + multinode: false + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 4, ep: 1, conc-list: [8, 16] } + - { tp: 4, ep: 4, conc-list: [64, 128] } + - { tp: 8, ep: 8, conc-list: [4, 64] } + - { tp: 8, ep: 8, dp-attn: true, conc-list: [512, 1024] } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 2, ep: 1, conc-list: [4, 16] } + - { tp: 4, ep: 1, conc-list: [4] } + - { tp: 2, ep: 2, conc-list: [8, 32] } + - { tp: 8, ep: 8, conc-list: [4] } + - { tp: 8, ep: 8, dp-attn: true, conc-list: [256, 512, 1024] } diff --git a/benchmarks/single_node/fixed_seq_len/qwen3.5_fp4_b200_trt.sh b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp4_b200_trt.sh new file mode 100644 index 000000000..cb7dee6a9 --- /dev/null +++ b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp4_b200_trt.sh @@ -0,0 +1,123 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + CONC \ + ISL \ + OSL \ + MAX_MODEL_LEN \ + RANDOM_RANGE_RATIO \ + RESULT_FILENAME \ + DP_ATTENTION \ + EP_SIZE + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION" + +if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi + +nvidia-smi + +SERVER_LOG=/workspace/server.log +EXTRA_CONFIG_FILE="qwen3.5-fp4-trt.yml" + +MAX_BATCH_SIZE=$(( CONC > 16 ? 
CONC : 16 )) + +if [[ "$DP_ATTENTION" == "true" ]]; then + MOE_BACKEND="CUTEDSL" + MODE_CONFIG="attention_dp_config: + enable_balance: true + batching_wait_iters: 10 + timeout_iters: 500" +else + MOE_BACKEND="TRTLLM" + MODE_CONFIG="batch_wait_timeout_iters: 50 +batch_wait_max_tokens_ratio: 0.45" +fi + +cat > "$EXTRA_CONFIG_FILE" << EOF +backend: pytorch +print_iter_log: true +enable_layerwise_nvtx_marker: false +disable_overlap_scheduler: false +enable_iter_perf_stats: true +enable_chunked_prefill: false +stream_interval: 20 +num_postprocess_workers: 4 +enable_attention_dp: $DP_ATTENTION +scheduler_config: + capacity_scheduler_policy: MAX_UTILIZATION + context_chunking_policy: FIRST_COME_FIRST_SERVED +kv_cache_config: + free_gpu_memory_fraction: 0.9 + enable_block_reuse: false + dtype: fp8 +cuda_graph_config: + enable_padding: true + max_batch_size: $MAX_BATCH_SIZE +moe_config: + backend: $MOE_BACKEND + use_low_precision_moe_combine: true +$MODE_CONFIG +EOF + +echo "Generated config file contents:" +cat "$EXTRA_CONFIG_FILE" + +MAX_MODEL_LEN=$(( MAX_MODEL_LEN > 8192 ? MAX_MODEL_LEN : 8192 )) +MAX_NUM_TOKENS=$(( ISL + OSL + 256 )) +MAX_NUM_TOKENS=$(( MAX_NUM_TOKENS > 8192 ? MAX_NUM_TOKENS : 8192 )) + +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" + MAX_NUM_TOKENS="$EVAL_MAX_MODEL_LEN" +fi + +# Start GPU monitoring (power, temperature, clocks every second) +start_gpu_monitor + +set -x +mpirun -n 1 --oversubscribe --allow-run-as-root \ + trtllm-serve "$MODEL" --port="$PORT" \ + --trust_remote_code \ + --backend=pytorch \ + --max_batch_size "$MAX_BATCH_SIZE" \ + --max_seq_len="$MAX_MODEL_LEN" \ + --max_num_tokens="$MAX_NUM_TOKENS" \ + --tp_size="$TP" --ep_size="$EP_SIZE" \ + --extra_llm_api_options="$EXTRA_CONFIG_FILE" \ + > "$SERVER_LOG" 2>&1 & + +SERVER_PID=$! 
+ +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend openai \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$(( CONC * 10 ))" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" + append_lm_eval_summary +fi + +# Stop GPU monitoring +stop_gpu_monitor +set +x diff --git a/perf-changelog.yaml b/perf-changelog.yaml index fd6b08dee..5bca02610 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3636,3 +3636,9 @@ - "Add MiniMax-M2.5 FP4 (NVFP4) B300 TensorRT-LLM benchmark (model: nvidia/MiniMax-M2.5-NVFP4)" - "Image: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc18" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1712 + +- config-keys: + - qwen3.5-fp4-b200-trt + description: + - "Add Qwen3.5-397B-A17B-NVFP4 B200 single-node TensorRT-LLM benchmark (1k/1k and 8k/1k) with a TP/TEP/DEP parallelism sweep" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1711