From 4af35fb89f91410bfba992113da557e66da4b610 Mon Sep 17 00:00:00 2001 From: Xianjie <5410381+qiaoxj07@users.noreply.github.com> Date: Tue, 9 Jun 2026 13:11:26 +0800 Subject: [PATCH 1/6] Sync dsv4-fp4-b200-trt recipes with 0608-B200 agg frontier config Sync the local B200 DeepSeek-V4-Pro aggregated frontier configs (deepseek-v4-pro-agg-202606, 0608-B200) into the TensorRT-LLM single-node recipes and document the provenance in the sweep config. dsv4_fp4_b200_trt.sh (from the MTP0 blocks): - Add worker tuning envs from env_vars (worker_env_var + gen_worker_env_var): TRTLLM_SERVER_DISABLE_GC, TRTLLM_WORKER_DISABLE_GC, NCCL_GRAPH_MIXING_SUPPORT, MIMALLOC_PURGE_DELAY, PYTORCH_CUDA_ALLOC_CONF (all overridable). The user-specific TLLM_AUTOTUNER_CACHE_PATH is intentionally omitted. - kv_cache free_gpu_memory_fraction: 0.9 (TP) / 0.7 (DP-attn), was 0.50. - attention_dp_config: batching_wait_iters 0 -> 30, drop timeout_iters. - stream_interval 10 -> 100; moe_config use_low_precision_moe_combine: true. - Make MOE_BACKEND overridable (default TRTLLM). dsv4_fp4_b200_trt_mtp.sh (from the MTP blocks): same as above, with DP-attn free_gpu_memory_fraction 0.6, enable_lm_head_tp_in_adp: true on the DP-attn path, and default MTP level 2 -> 3. cuda_graph_config / max_batch_size and the max_seq_len / max_num_tokens sizing are kept as-is (the latter floored for random-range-ratio safety). The MTP field stays num_nextn_predict_layers (repo/image convention) rather than the spreadsheet's max_draft_len. 
--- .github/configs/nvidia-master.yaml | 7 ++++ .../fixed_seq_len/dsv4_fp4_b200_trt.sh | 29 +++++++++++--- .../fixed_seq_len/dsv4_fp4_b200_trt_mtp.sh | 38 +++++++++++++++---- 3 files changed, 60 insertions(+), 14 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index a02749d4d..62cfef2a3 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1800,6 +1800,10 @@ dsv4-fp4-b200-vllm-agentic: - { tp: 8, offloading: cpu, conc-list: [16, 32, 64] } - { tp: 8, ep: 8, dp-attn: true, offloading: cpu, conc-list: [64, 128, 256] } +# Recipe config (worker envs + trtllm-serve YAML in dsv4_fp4_b200_trt.sh) synced +# from the 0608-B200 DeepSeek-V4-Pro agg frontier (deepseek-v4-pro-agg-202606, +# MTP0 blocks): free_gpu_memory_fraction 0.9/0.7 (TP / DP-attn), stream_interval +# 100, attention_dp batching_wait_iters 30, moe use_low_precision_moe_combine. dsv4-fp4-b200-trt: image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-9aa3715 model: deepseek-ai/DeepSeek-V4-Pro @@ -1821,6 +1825,9 @@ dsv4-fp4-b200-trt: - { tp: 8, conc-start: 1, conc-end: 32 } - { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 1024 } +# Recipe config (dsv4_fp4_b200_trt_mtp.sh) synced from the 0608-B200 agg frontier +# (MTP blocks): MTP level 3, free_gpu_memory_fraction 0.9/0.6 (TP / DP-attn), +# enable_lm_head_tp_in_adp on the DP-attn path; other fields as dsv4-fp4-b200-trt. 
dsv4-fp4-b200-trt-mtp: image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-9aa3715 model: deepseek-ai/DeepSeek-V4-Pro diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt.sh index e4a24dea2..e630e2c6e 100644 --- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt.sh +++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt.sh @@ -47,6 +47,15 @@ sanitize_slurm_mpi_env_for_trtllm export NCCL_NVLS_ENABLE="${NCCL_NVLS_ENABLE:-0}" echo "NCCL_NVLS_ENABLE: $NCCL_NVLS_ENABLE" +# DeepSeek-V4 TRTLLM worker tuning envs, synced from the 0608-B200 agg frontier +# (env_vars: worker_env_var + gen_worker_env_var; the user-specific +# TLLM_AUTOTUNER_CACHE_PATH is intentionally omitted). All overridable. +export TRTLLM_SERVER_DISABLE_GC="${TRTLLM_SERVER_DISABLE_GC:-1}" +export TRTLLM_WORKER_DISABLE_GC="${TRTLLM_WORKER_DISABLE_GC:-1}" +export NCCL_GRAPH_MIXING_SUPPORT="${NCCL_GRAPH_MIXING_SUPPORT:-0}" +export MIMALLOC_PURGE_DELAY="${MIMALLOC_PURGE_DELAY:-0}" +export PYTORCH_CUDA_ALLOC_CONF="${PYTORCH_CUDA_ALLOC_CONF:-expandable_segments:True}" + if [[ "$MODEL" != /* ]]; then hf download "$MODEL" fi @@ -56,18 +65,25 @@ nvidia-smi SERVER_LOG="$PWD/server.log" EXTRA_CONFIG_FILE="dsv4-fp4-trt.yml" -MOE_BACKEND="TRTLLM" +# MoE backend: TRTLLM is the frontier default across the sweep. The 0608-B200 +# data used MEGAMOE_DEEPGEMM only at the very top concurrency (1k1k conc=2048); +# set MOE_BACKEND=MEGAMOE_DEEPGEMM to reproduce that point. +MOE_BACKEND="${MOE_BACKEND:-TRTLLM}" MAX_BATCH_SIZE=$(( CONC > 16 ? CONC : 16 )) CUDA_GRAPH_MAX_BATCH_SIZE="$MAX_BATCH_SIZE" -KV_CACHE_FREE_MEM_FRACTION="${KV_CACHE_FREE_MEM_FRACTION:-0.50}" +# free_gpu_memory_fraction from 0608-B200: 0.9 (TP / no DP-attn), 0.7 (DP-attn). 
+if [[ "$DP_ATTENTION" == "true" ]]; then + KV_CACHE_FREE_MEM_FRACTION="${KV_CACHE_FREE_MEM_FRACTION:-0.7}" +else + KV_CACHE_FREE_MEM_FRACTION="${KV_CACHE_FREE_MEM_FRACTION:-0.9}" +fi ATTENTION_DP_CONFIG="" if [[ "$DP_ATTENTION" == "true" ]]; then ATTENTION_DP_CONFIG=" attention_dp_config: - batching_wait_iters: 0 - enable_balance: true - timeout_iters: 60" + batching_wait_iters: 30 + enable_balance: true" fi cat > "$EXTRA_CONFIG_FILE" << EOF @@ -81,10 +97,11 @@ kv_cache_config: dtype: fp8 free_gpu_memory_fraction: $KV_CACHE_FREE_MEM_FRACTION enable_block_reuse: false -stream_interval: 10 +stream_interval: 100 num_postprocess_workers: 4 moe_config: backend: $MOE_BACKEND + use_low_precision_moe_combine: true EOF echo "Generated config file contents:" diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt_mtp.sh index 9e5c88212..cf3d52ddf 100644 --- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt_mtp.sh @@ -46,6 +46,15 @@ sanitize_slurm_mpi_env_for_trtllm export NCCL_NVLS_ENABLE="${NCCL_NVLS_ENABLE:-0}" echo "NCCL_NVLS_ENABLE: $NCCL_NVLS_ENABLE" +# DeepSeek-V4 TRTLLM worker tuning envs, synced from the 0608-B200 agg frontier +# (env_vars: worker_env_var + gen_worker_env_var; the user-specific +# TLLM_AUTOTUNER_CACHE_PATH is intentionally omitted). All overridable. 
+export TRTLLM_SERVER_DISABLE_GC="${TRTLLM_SERVER_DISABLE_GC:-1}" +export TRTLLM_WORKER_DISABLE_GC="${TRTLLM_WORKER_DISABLE_GC:-1}" +export NCCL_GRAPH_MIXING_SUPPORT="${NCCL_GRAPH_MIXING_SUPPORT:-0}" +export MIMALLOC_PURGE_DELAY="${MIMALLOC_PURGE_DELAY:-0}" +export PYTORCH_CUDA_ALLOC_CONF="${PYTORCH_CUDA_ALLOC_CONF:-expandable_segments:True}" + if [[ "$MODEL" != /* ]]; then hf download "$MODEL" fi @@ -55,36 +64,49 @@ nvidia-smi SERVER_LOG="$PWD/server.log" EXTRA_CONFIG_FILE="dsv4-fp4-trt-mtp.yml" -MOE_BACKEND="TRTLLM" -MTP="${TRTLLM_DSV4_MTP_NUM_NEXTN_LAYERS:-2}" +# MoE backend: TRTLLM is the frontier default across the sweep. The 0608-B200 +# MTP data used MEGAMOE_DEEPGEMM only at the top concurrencies (1k1k conc>=512); +# set MOE_BACKEND=MEGAMOE_DEEPGEMM to reproduce those points. +MOE_BACKEND="${MOE_BACKEND:-TRTLLM}" +# 0608-B200 MTP frontier runs at MTP level 3 (overridable). +MTP="${TRTLLM_DSV4_MTP_NUM_NEXTN_LAYERS:-3}" MAX_BATCH_SIZE=$(( CONC > 16 ? CONC : 16 )) CUDA_GRAPH_MAX_BATCH_SIZE="$MAX_BATCH_SIZE" -KV_CACHE_FREE_MEM_FRACTION="${KV_CACHE_FREE_MEM_FRACTION:-0.50}" +# free_gpu_memory_fraction from 0608-B200 MTP: 0.9 (TP / no DP-attn), 0.6 (DP-attn). +if [[ "$DP_ATTENTION" == "true" ]]; then + KV_CACHE_FREE_MEM_FRACTION="${KV_CACHE_FREE_MEM_FRACTION:-0.6}" +else + KV_CACHE_FREE_MEM_FRACTION="${KV_CACHE_FREE_MEM_FRACTION:-0.9}" +fi +# enable_lm_head_tp_in_adp: the 0608-B200 MTP frontier sets this on the DP-attn path. 
+LM_HEAD_TP_IN_ADP_CONFIG="" ATTENTION_DP_CONFIG="" if [[ "$DP_ATTENTION" == "true" ]]; then + LM_HEAD_TP_IN_ADP_CONFIG=" +enable_lm_head_tp_in_adp: true" ATTENTION_DP_CONFIG=" attention_dp_config: - batching_wait_iters: 0 - enable_balance: true - timeout_iters: 60" + batching_wait_iters: 30 + enable_balance: true" fi cat > "$EXTRA_CONFIG_FILE" << EOF cuda_graph_config: enable_padding: true max_batch_size: $CUDA_GRAPH_MAX_BATCH_SIZE -enable_attention_dp: $DP_ATTENTION$ATTENTION_DP_CONFIG +enable_attention_dp: $DP_ATTENTION$ATTENTION_DP_CONFIG$LM_HEAD_TP_IN_ADP_CONFIG print_iter_log: true kv_cache_config: tokens_per_block: 128 dtype: fp8 free_gpu_memory_fraction: $KV_CACHE_FREE_MEM_FRACTION enable_block_reuse: false -stream_interval: 10 +stream_interval: 100 num_postprocess_workers: 4 moe_config: backend: $MOE_BACKEND + use_low_precision_moe_combine: true speculative_config: decoding_type: MTP num_nextn_predict_layers: $MTP From a532ed681e2b58604458ef09c7dd1d4ef3154b6d Mon Sep 17 00:00:00 2001 From: Xianjie <5410381+qiaoxj07@users.noreply.github.com> Date: Tue, 9 Jun 2026 13:20:55 +0800 Subject: [PATCH 2/6] Drop provenance comments; fold lm_head_tp_in_adp into ATTENTION_DP_CONFIG - Remove the 0608-B200 provenance comments from the recipes and nvidia-master.yaml. - In dsv4_fp4_b200_trt_mtp.sh, emit enable_lm_head_tp_in_adp from within the DP-attn ATTENTION_DP_CONFIG block instead of a separate variable, so the heredoc line matches the non-MTP recipe idiom. Generated YAML is unchanged. 
--- .github/configs/nvidia-master.yaml | 7 ------- .../fixed_seq_len/dsv4_fp4_b200_trt.sh | 7 ------- .../fixed_seq_len/dsv4_fp4_b200_trt_mtp.sh | 17 +++-------------- 3 files changed, 3 insertions(+), 28 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 62cfef2a3..a02749d4d 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1800,10 +1800,6 @@ dsv4-fp4-b200-vllm-agentic: - { tp: 8, offloading: cpu, conc-list: [16, 32, 64] } - { tp: 8, ep: 8, dp-attn: true, offloading: cpu, conc-list: [64, 128, 256] } -# Recipe config (worker envs + trtllm-serve YAML in dsv4_fp4_b200_trt.sh) synced -# from the 0608-B200 DeepSeek-V4-Pro agg frontier (deepseek-v4-pro-agg-202606, -# MTP0 blocks): free_gpu_memory_fraction 0.9/0.7 (TP / DP-attn), stream_interval -# 100, attention_dp batching_wait_iters 30, moe use_low_precision_moe_combine. dsv4-fp4-b200-trt: image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-9aa3715 model: deepseek-ai/DeepSeek-V4-Pro @@ -1825,9 +1821,6 @@ dsv4-fp4-b200-trt: - { tp: 8, conc-start: 1, conc-end: 32 } - { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 1024 } -# Recipe config (dsv4_fp4_b200_trt_mtp.sh) synced from the 0608-B200 agg frontier -# (MTP blocks): MTP level 3, free_gpu_memory_fraction 0.9/0.6 (TP / DP-attn), -# enable_lm_head_tp_in_adp on the DP-attn path; other fields as dsv4-fp4-b200-trt. 
dsv4-fp4-b200-trt-mtp: image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-9aa3715 model: deepseek-ai/DeepSeek-V4-Pro diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt.sh index e630e2c6e..115be8e50 100644 --- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt.sh +++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt.sh @@ -47,9 +47,6 @@ sanitize_slurm_mpi_env_for_trtllm export NCCL_NVLS_ENABLE="${NCCL_NVLS_ENABLE:-0}" echo "NCCL_NVLS_ENABLE: $NCCL_NVLS_ENABLE" -# DeepSeek-V4 TRTLLM worker tuning envs, synced from the 0608-B200 agg frontier -# (env_vars: worker_env_var + gen_worker_env_var; the user-specific -# TLLM_AUTOTUNER_CACHE_PATH is intentionally omitted). All overridable. export TRTLLM_SERVER_DISABLE_GC="${TRTLLM_SERVER_DISABLE_GC:-1}" export TRTLLM_WORKER_DISABLE_GC="${TRTLLM_WORKER_DISABLE_GC:-1}" export NCCL_GRAPH_MIXING_SUPPORT="${NCCL_GRAPH_MIXING_SUPPORT:-0}" @@ -65,13 +62,9 @@ nvidia-smi SERVER_LOG="$PWD/server.log" EXTRA_CONFIG_FILE="dsv4-fp4-trt.yml" -# MoE backend: TRTLLM is the frontier default across the sweep. The 0608-B200 -# data used MEGAMOE_DEEPGEMM only at the very top concurrency (1k1k conc=2048); -# set MOE_BACKEND=MEGAMOE_DEEPGEMM to reproduce that point. MOE_BACKEND="${MOE_BACKEND:-TRTLLM}" MAX_BATCH_SIZE=$(( CONC > 16 ? CONC : 16 )) CUDA_GRAPH_MAX_BATCH_SIZE="$MAX_BATCH_SIZE" -# free_gpu_memory_fraction from 0608-B200: 0.9 (TP / no DP-attn), 0.7 (DP-attn). 
if [[ "$DP_ATTENTION" == "true" ]]; then KV_CACHE_FREE_MEM_FRACTION="${KV_CACHE_FREE_MEM_FRACTION:-0.7}" else diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt_mtp.sh index cf3d52ddf..6f8f06683 100644 --- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt_mtp.sh @@ -46,9 +46,6 @@ sanitize_slurm_mpi_env_for_trtllm export NCCL_NVLS_ENABLE="${NCCL_NVLS_ENABLE:-0}" echo "NCCL_NVLS_ENABLE: $NCCL_NVLS_ENABLE" -# DeepSeek-V4 TRTLLM worker tuning envs, synced from the 0608-B200 agg frontier -# (env_vars: worker_env_var + gen_worker_env_var; the user-specific -# TLLM_AUTOTUNER_CACHE_PATH is intentionally omitted). All overridable. export TRTLLM_SERVER_DISABLE_GC="${TRTLLM_SERVER_DISABLE_GC:-1}" export TRTLLM_WORKER_DISABLE_GC="${TRTLLM_WORKER_DISABLE_GC:-1}" export NCCL_GRAPH_MIXING_SUPPORT="${NCCL_GRAPH_MIXING_SUPPORT:-0}" @@ -64,38 +61,30 @@ nvidia-smi SERVER_LOG="$PWD/server.log" EXTRA_CONFIG_FILE="dsv4-fp4-trt-mtp.yml" -# MoE backend: TRTLLM is the frontier default across the sweep. The 0608-B200 -# MTP data used MEGAMOE_DEEPGEMM only at the top concurrencies (1k1k conc>=512); -# set MOE_BACKEND=MEGAMOE_DEEPGEMM to reproduce those points. MOE_BACKEND="${MOE_BACKEND:-TRTLLM}" -# 0608-B200 MTP frontier runs at MTP level 3 (overridable). MTP="${TRTLLM_DSV4_MTP_NUM_NEXTN_LAYERS:-3}" MAX_BATCH_SIZE=$(( CONC > 16 ? CONC : 16 )) CUDA_GRAPH_MAX_BATCH_SIZE="$MAX_BATCH_SIZE" -# free_gpu_memory_fraction from 0608-B200 MTP: 0.9 (TP / no DP-attn), 0.6 (DP-attn). if [[ "$DP_ATTENTION" == "true" ]]; then KV_CACHE_FREE_MEM_FRACTION="${KV_CACHE_FREE_MEM_FRACTION:-0.6}" else KV_CACHE_FREE_MEM_FRACTION="${KV_CACHE_FREE_MEM_FRACTION:-0.9}" fi -# enable_lm_head_tp_in_adp: the 0608-B200 MTP frontier sets this on the DP-attn path. 
-LM_HEAD_TP_IN_ADP_CONFIG="" ATTENTION_DP_CONFIG="" if [[ "$DP_ATTENTION" == "true" ]]; then - LM_HEAD_TP_IN_ADP_CONFIG=" -enable_lm_head_tp_in_adp: true" ATTENTION_DP_CONFIG=" attention_dp_config: batching_wait_iters: 30 - enable_balance: true" + enable_balance: true +enable_lm_head_tp_in_adp: true" fi cat > "$EXTRA_CONFIG_FILE" << EOF cuda_graph_config: enable_padding: true max_batch_size: $CUDA_GRAPH_MAX_BATCH_SIZE -enable_attention_dp: $DP_ATTENTION$ATTENTION_DP_CONFIG$LM_HEAD_TP_IN_ADP_CONFIG +enable_attention_dp: $DP_ATTENTION$ATTENTION_DP_CONFIG print_iter_log: true kv_cache_config: tokens_per_block: 128 From b2bffb1a7c353b7a679cb1ac997133805416abf2 Mon Sep 17 00:00:00 2001 From: Xianjie <5410381+qiaoxj07@users.noreply.github.com> Date: Tue, 9 Jun 2026 13:34:13 +0800 Subject: [PATCH 3/6] Extend MTP sweep to cover frontier; adapt MoE backend and MTP level by concurrency Sweep (nvidia-master.yaml, dsv4-fp4-b200-trt-mtp): - 1k1k DP conc-end 512 -> 1024; 8k1k DP conc-end 128 -> 256, so the MTP sweep reaches the top speculative frontier points. (conc=2048 / 8k1k conc=256 mtp0 points are produced by the MTP0 sweep, which already covers them.) Recipes: pick MoE backend / MTP level per concurrency to match the frontier, since the single-node search-space cannot pass them per entry. - dsv4_fp4_b200_trt.sh: MEGAMOE_DEEPGEMM for short ISL at conc >= 2048 (else TRTLLM). - dsv4_fp4_b200_trt_mtp.sh: MEGAMOE_DEEPGEMM for short ISL at conc >= 512; MTP draft length steps 3 -> 2 for long ISL at conc >= 128. All overridable via MOE_BACKEND / TRTLLM_DSV4_MTP_NUM_NEXTN_LAYERS. 
--- .github/configs/nvidia-master.yaml | 4 ++-- .../fixed_seq_len/dsv4_fp4_b200_trt.sh | 8 +++++++- .../fixed_seq_len/dsv4_fp4_b200_trt_mtp.sh | 16 ++++++++++++++-- 3 files changed, 23 insertions(+), 5 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index a02749d4d..9b9dd7943 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1835,12 +1835,12 @@ dsv4-fp4-b200-trt-mtp: osl: 1024 search-space: - { tp: 8, conc-start: 1, conc-end: 32, spec-decoding: mtp } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 512, spec-decoding: mtp } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 1024, spec-decoding: mtp } - isl: 8192 osl: 1024 search-space: - { tp: 8, conc-start: 1, conc-end: 32, spec-decoding: mtp } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 128, spec-decoding: mtp } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 256, spec-decoding: mtp } # MTP variant of dsv4-fp4-b200-vllm. Mirrors the base search space and adds # --speculative-config '{"method":"mtp","num_speculative_tokens":2}'. diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt.sh index 115be8e50..0999a7627 100644 --- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt.sh +++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt.sh @@ -62,7 +62,13 @@ nvidia-smi SERVER_LOG="$PWD/server.log" EXTRA_CONFIG_FILE="dsv4-fp4-trt.yml" -MOE_BACKEND="${MOE_BACKEND:-TRTLLM}" +# MoE backend: TRTLLM at low/mid concurrency; switch to MEGAMOE_DEEPGEMM at the +# top concurrency for short ISL (1k). +if [[ "$ISL" -le 1024 && "$CONC" -ge 2048 ]]; then + MOE_BACKEND="${MOE_BACKEND:-MEGAMOE_DEEPGEMM}" +else + MOE_BACKEND="${MOE_BACKEND:-TRTLLM}" +fi MAX_BATCH_SIZE=$(( CONC > 16 ? 
CONC : 16 )) CUDA_GRAPH_MAX_BATCH_SIZE="$MAX_BATCH_SIZE" if [[ "$DP_ATTENTION" == "true" ]]; then diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt_mtp.sh index 6f8f06683..b6c1bbc98 100644 --- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt_mtp.sh @@ -61,8 +61,20 @@ nvidia-smi SERVER_LOG="$PWD/server.log" EXTRA_CONFIG_FILE="dsv4-fp4-trt-mtp.yml" -MOE_BACKEND="${MOE_BACKEND:-TRTLLM}" -MTP="${TRTLLM_DSV4_MTP_NUM_NEXTN_LAYERS:-3}" +# MoE backend: TRTLLM at low/mid concurrency; switch to MEGAMOE_DEEPGEMM at high +# concurrency for short ISL (1k). +if [[ "$ISL" -le 1024 && "$CONC" -ge 512 ]]; then + MOE_BACKEND="${MOE_BACKEND:-MEGAMOE_DEEPGEMM}" +else + MOE_BACKEND="${MOE_BACKEND:-TRTLLM}" +fi +# MTP draft length: 3 at low/mid concurrency; steps down to 2 at high concurrency +# for long ISL (8k). +if [[ "$ISL" -ge 4096 && "$CONC" -ge 128 ]]; then + MTP="${TRTLLM_DSV4_MTP_NUM_NEXTN_LAYERS:-2}" +else + MTP="${TRTLLM_DSV4_MTP_NUM_NEXTN_LAYERS:-3}" +fi MAX_BATCH_SIZE=$(( CONC > 16 ? CONC : 16 )) CUDA_GRAPH_MAX_BATCH_SIZE="$MAX_BATCH_SIZE" if [[ "$DP_ATTENTION" == "true" ]]; then From 0eda0d8d9786f3a23acd403661a8ae8bafad510b Mon Sep 17 00:00:00 2001 From: Xianjie <5410381+qiaoxj07@users.noreply.github.com> Date: Tue, 9 Jun 2026 14:01:24 +0800 Subject: [PATCH 4/6] Use max_draft_len for MTP; drop OSL from max_num_tokens - dsv4_fp4_b200_trt_mtp.sh: speculative_config field num_nextn_predict_layers -> max_draft_len (matches the agg config / deployed image). - max_num_tokens drops the OSL term in both recipes (output tokens are emitted one at a time, not in the prefill chunk): ISL + 256 (MTP0) and ISL + (MTP+1)*batch + 256 (MTP, keeps the speculative-verification headroom). max_seq_len stays floored at >= 8192 as headroom for server-side chat-template tokens on the openai-chat path. 
--- benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt.sh | 2 +- benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt_mtp.sh | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt.sh index 0999a7627..ce567c908 100644 --- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt.sh +++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt.sh @@ -107,7 +107,7 @@ echo "Generated config file contents:" cat "$EXTRA_CONFIG_FILE" MAX_MODEL_LEN=$(( MAX_MODEL_LEN > 8192 ? MAX_MODEL_LEN : 8192 )) -MAX_NUM_TOKENS=$(( ISL + OSL + 256 )) +MAX_NUM_TOKENS=$(( ISL + 256 )) MAX_NUM_TOKENS=$(( MAX_NUM_TOKENS > 8192 ? MAX_NUM_TOKENS : 8192 )) if [ "${EVAL_ONLY}" = "true" ]; then diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt_mtp.sh index b6c1bbc98..0c7f32363 100644 --- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt_mtp.sh @@ -110,14 +110,14 @@ moe_config: use_low_precision_moe_combine: true speculative_config: decoding_type: MTP - num_nextn_predict_layers: $MTP + max_draft_len: $MTP EOF echo "Generated config file contents:" cat "$EXTRA_CONFIG_FILE" MAX_MODEL_LEN=$(( MAX_MODEL_LEN > 8192 ? MAX_MODEL_LEN : 8192 )) -MAX_NUM_TOKENS=$(( ISL + OSL + (MTP + 1) * MAX_BATCH_SIZE + 256 )) +MAX_NUM_TOKENS=$(( ISL + (MTP + 1) * MAX_BATCH_SIZE + 256 )) MAX_NUM_TOKENS=$(( MAX_NUM_TOKENS > 8192 ? 
MAX_NUM_TOKENS : 8192 )) if [ "${EVAL_ONLY}" = "true" ]; then From f5dfcb53a7e9b98731912cf3e3f65e56d78d0f46 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Wed, 10 Jun 2026 10:37:06 -0700 Subject: [PATCH 5/6] Update DSv4 B200 TRT image to feat-deepseek_v4-c185066 Bump the dsv4-fp4-b200-trt and dsv4-fp4-b200-trt-mtp images from feat-deepseek_v4-9aa3715 to feat-deepseek_v4-c185066, and add a perf-changelog entry covering the image bump and the B200 agg frontier config sync already in this PR. Co-Authored-By: Claude Opus 4.8 (1M context) --- .github/configs/nvidia-master.yaml | 4 ++-- perf-changelog.yaml | 10 ++++++++++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 9b9dd7943..fef984cf2 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1801,7 +1801,7 @@ dsv4-fp4-b200-vllm-agentic: - { tp: 8, ep: 8, dp-attn: true, offloading: cpu, conc-list: [64, 128, 256] } dsv4-fp4-b200-trt: - image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-9aa3715 + image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-c185066 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b200-dsv4 @@ -1822,7 +1822,7 @@ dsv4-fp4-b200-trt: - { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 1024 } dsv4-fp4-b200-trt-mtp: - image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-9aa3715 + image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-c185066 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b200-dsv4 diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 5622173f1..c06b73b6a 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3531,3 +3531,13 @@ - "The Rust frontend replaces only the Python serving/API layer (HTTP, tokenization, scheduling glue, detokenization) and spawns the same Python EngineCore, so GPU kernels/attention/MoE 
GEMM/KV cache are untouched" - "A/B sweep (28 single-node points, 1k1k + 8k1k, TP 1/2/4) vs the Python-frontend baseline (run 26696260751): throughput Pareto-neutral (peak tok/s/GPU within <1.5%, frontiers coincident) and TPOT flat (+-0.5%); TTFT improves ~8% at 1k1k and ~22% at 8k1k (every point), the expected signature of lower frontend CPU latency before first token, scaling with input length" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1634 + +- config-keys: + - dsv4-fp4-b200-trt + - dsv4-fp4-b200-trt-mtp + description: + - "Update the B200 TensorRT-LLM DeepSeek-V4-Pro image to ghcr.io/semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-c185066" + - "Sync the dsv4-fp4-b200-trt and dsv4-fp4-b200-trt-mtp recipes with the B200 aggregated frontier config (worker GC off, NCCL graph mixing off, mimalloc/PyTorch alloc tweaks, higher KV cache fractions by DP path, stream_interval 100, use_low_precision_moe_combine, DP batching_wait_iters 30, max_num_tokens drops the OSL term)" + - "MTP recipe uses max_draft_len with a variable default draft length, enable_lm_head_tp_in_adp on the DP-attn path, and removes timeout_iters from the DP config" + - "Raise dsv4-fp4-b200-trt-mtp DP-attn conc-end (1k ISL: 512->1024; 8k ISL: 128->256) to cover the new high-concurrency regime" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1699 From 8ef14b8e60c18490b6dd81d75114737dd9bccf99 Mon Sep 17 00:00:00 2001 From: Bryan Shan <58582368+Oseltamivir@users.noreply.github.com> Date: Thu, 11 Jun 2026 23:14:45 -0700 Subject: [PATCH 6/6] Lower DSv4 B200 TRT DP-attn conc-end limits in nvidia-master.yaml --- .github/configs/nvidia-master.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 8b4468d92..a5b2fc4b3 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1814,12 +1814,12 @@ dsv4-fp4-b200-trt: osl: 1024 search-space: - { tp: 8, conc-start: 1, conc-end: 32 } - - {
tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 2048 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 512 } - isl: 8192 osl: 1024 search-space: - { tp: 8, conc-start: 1, conc-end: 32 } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 1024 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 512 } dsv4-fp4-b200-trt-mtp: image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-c185066 @@ -1835,7 +1835,7 @@ dsv4-fp4-b200-trt-mtp: osl: 1024 search-space: - { tp: 8, conc-start: 1, conc-end: 32, spec-decoding: mtp } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 1024, spec-decoding: mtp } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 256, spec-decoding: mtp } - isl: 8192 osl: 1024 search-space: