From 4af35fb89f91410bfba992113da557e66da4b610 Mon Sep 17 00:00:00 2001
From: Xianjie <5410381+qiaoxj07@users.noreply.github.com>
Date: Tue, 9 Jun 2026 13:11:26 +0800
Subject: [PATCH 1/6] Sync dsv4-fp4-b200-trt recipes with 0608-B200 agg
 frontier config

Sync the local B200 DeepSeek-V4-Pro aggregated frontier configs
(deepseek-v4-pro-agg-202606, 0608-B200) into the TensorRT-LLM single-node
recipes and document the provenance in the sweep config.

dsv4_fp4_b200_trt.sh (from the MTP0 blocks):
- Add worker tuning envs from env_vars (worker_env_var + gen_worker_env_var):
  TRTLLM_SERVER_DISABLE_GC, TRTLLM_WORKER_DISABLE_GC, NCCL_GRAPH_MIXING_SUPPORT,
  MIMALLOC_PURGE_DELAY, PYTORCH_CUDA_ALLOC_CONF (all overridable). The
  user-specific TLLM_AUTOTUNER_CACHE_PATH is intentionally omitted.
- kv_cache free_gpu_memory_fraction: 0.9 (TP) / 0.7 (DP-attn), was 0.50.
- attention_dp_config: batching_wait_iters 0 -> 30, drop timeout_iters.
- stream_interval 10 -> 100; moe_config use_low_precision_moe_combine: true.
- Make MOE_BACKEND overridable (default TRTLLM).

dsv4_fp4_b200_trt_mtp.sh (from the MTP blocks): same as above, with DP-attn
free_gpu_memory_fraction 0.6, enable_lm_head_tp_in_adp: true on the DP-attn
path, and default MTP level 2 -> 3.

cuda_graph_config / max_batch_size and the max_seq_len / max_num_tokens sizing
are kept as-is (the latter floored for random-range-ratio safety). The MTP field
stays num_nextn_predict_layers (repo/image convention) rather than the
spreadsheet's max_draft_len.
---
 .github/configs/nvidia-master.yaml            |  7 ++++
 .../fixed_seq_len/dsv4_fp4_b200_trt.sh        | 29 +++++++++++---
 .../fixed_seq_len/dsv4_fp4_b200_trt_mtp.sh    | 38 +++++++++++++++----
 3 files changed, 60 insertions(+), 14 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index a02749d4d..62cfef2a3 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -1800,6 +1800,10 @@ dsv4-fp4-b200-vllm-agentic:
       - { tp: 8, offloading: cpu,  conc-list: [16, 32, 64] }
       - { tp: 8, ep: 8, dp-attn: true, offloading: cpu,  conc-list: [64, 128, 256] }
 
+# Recipe config (worker envs + trtllm-serve YAML in dsv4_fp4_b200_trt.sh) synced
+# from the 0608-B200 DeepSeek-V4-Pro agg frontier (deepseek-v4-pro-agg-202606,
+# MTP0 blocks): free_gpu_memory_fraction 0.9/0.7 (TP / DP-attn), stream_interval
+# 100, attention_dp batching_wait_iters 30, moe use_low_precision_moe_combine.
 dsv4-fp4-b200-trt:
   image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-9aa3715
   model: deepseek-ai/DeepSeek-V4-Pro
@@ -1821,6 +1825,9 @@ dsv4-fp4-b200-trt:
       - { tp: 8, conc-start: 1, conc-end: 32 }
       - { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 1024 }
 
+# Recipe config (dsv4_fp4_b200_trt_mtp.sh) synced from the 0608-B200 agg frontier
+# (MTP blocks): MTP level 3, free_gpu_memory_fraction 0.9/0.6 (TP / DP-attn),
+# enable_lm_head_tp_in_adp on the DP-attn path; other fields as dsv4-fp4-b200-trt.
 dsv4-fp4-b200-trt-mtp:
   image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-9aa3715
   model: deepseek-ai/DeepSeek-V4-Pro
diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt.sh
index e4a24dea2..e630e2c6e 100644
--- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt.sh
@@ -47,6 +47,15 @@ sanitize_slurm_mpi_env_for_trtllm
 export NCCL_NVLS_ENABLE="${NCCL_NVLS_ENABLE:-0}"
 echo "NCCL_NVLS_ENABLE: $NCCL_NVLS_ENABLE"
 
+# DeepSeek-V4 TRTLLM worker tuning envs, synced from the 0608-B200 agg frontier
+# (env_vars: worker_env_var + gen_worker_env_var; the user-specific
+# TLLM_AUTOTUNER_CACHE_PATH is intentionally omitted). All overridable.
+export TRTLLM_SERVER_DISABLE_GC="${TRTLLM_SERVER_DISABLE_GC:-1}"
+export TRTLLM_WORKER_DISABLE_GC="${TRTLLM_WORKER_DISABLE_GC:-1}"
+export NCCL_GRAPH_MIXING_SUPPORT="${NCCL_GRAPH_MIXING_SUPPORT:-0}"
+export MIMALLOC_PURGE_DELAY="${MIMALLOC_PURGE_DELAY:-0}"
+export PYTORCH_CUDA_ALLOC_CONF="${PYTORCH_CUDA_ALLOC_CONF:-expandable_segments:True}"
+
 if [[ "$MODEL" != /* ]]; then
     hf download "$MODEL"
 fi
@@ -56,18 +65,25 @@ nvidia-smi
 SERVER_LOG="$PWD/server.log"
 EXTRA_CONFIG_FILE="dsv4-fp4-trt.yml"
 
-MOE_BACKEND="TRTLLM"
+# MoE backend: TRTLLM is the frontier default across the sweep. The 0608-B200
+# data used MEGAMOE_DEEPGEMM only at the very top concurrency (1k1k conc=2048);
+# set MOE_BACKEND=MEGAMOE_DEEPGEMM to reproduce that point.
+MOE_BACKEND="${MOE_BACKEND:-TRTLLM}"
 MAX_BATCH_SIZE=$(( CONC > 16 ? CONC : 16 ))
 CUDA_GRAPH_MAX_BATCH_SIZE="$MAX_BATCH_SIZE"
-KV_CACHE_FREE_MEM_FRACTION="${KV_CACHE_FREE_MEM_FRACTION:-0.50}"
+# free_gpu_memory_fraction from 0608-B200: 0.9 (TP / no DP-attn), 0.7 (DP-attn).
+if [[ "$DP_ATTENTION" == "true" ]]; then
+    KV_CACHE_FREE_MEM_FRACTION="${KV_CACHE_FREE_MEM_FRACTION:-0.7}"
+else
+    KV_CACHE_FREE_MEM_FRACTION="${KV_CACHE_FREE_MEM_FRACTION:-0.9}"
+fi
 
 ATTENTION_DP_CONFIG=""
 if [[ "$DP_ATTENTION" == "true" ]]; then
     ATTENTION_DP_CONFIG="
 attention_dp_config:
-    batching_wait_iters: 0
-    enable_balance: true
-    timeout_iters: 60"
+    batching_wait_iters: 30
+    enable_balance: true"
 fi
 
 cat > "$EXTRA_CONFIG_FILE" << EOF
@@ -81,10 +97,11 @@ kv_cache_config:
     dtype: fp8
     free_gpu_memory_fraction: $KV_CACHE_FREE_MEM_FRACTION
     enable_block_reuse: false
-stream_interval: 10
+stream_interval: 100
 num_postprocess_workers: 4
 moe_config:
     backend: $MOE_BACKEND
+    use_low_precision_moe_combine: true
 EOF
 
 echo "Generated config file contents:"
diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt_mtp.sh
index 9e5c88212..cf3d52ddf 100644
--- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt_mtp.sh
@@ -46,6 +46,15 @@ sanitize_slurm_mpi_env_for_trtllm
 export NCCL_NVLS_ENABLE="${NCCL_NVLS_ENABLE:-0}"
 echo "NCCL_NVLS_ENABLE: $NCCL_NVLS_ENABLE"
 
+# DeepSeek-V4 TRTLLM worker tuning envs, synced from the 0608-B200 agg frontier
+# (env_vars: worker_env_var + gen_worker_env_var; the user-specific
+# TLLM_AUTOTUNER_CACHE_PATH is intentionally omitted). All overridable.
+export TRTLLM_SERVER_DISABLE_GC="${TRTLLM_SERVER_DISABLE_GC:-1}"
+export TRTLLM_WORKER_DISABLE_GC="${TRTLLM_WORKER_DISABLE_GC:-1}"
+export NCCL_GRAPH_MIXING_SUPPORT="${NCCL_GRAPH_MIXING_SUPPORT:-0}"
+export MIMALLOC_PURGE_DELAY="${MIMALLOC_PURGE_DELAY:-0}"
+export PYTORCH_CUDA_ALLOC_CONF="${PYTORCH_CUDA_ALLOC_CONF:-expandable_segments:True}"
+
 if [[ "$MODEL" != /* ]]; then
     hf download "$MODEL"
 fi
@@ -55,36 +64,49 @@ nvidia-smi
 SERVER_LOG="$PWD/server.log"
 EXTRA_CONFIG_FILE="dsv4-fp4-trt-mtp.yml"
 
-MOE_BACKEND="TRTLLM"
-MTP="${TRTLLM_DSV4_MTP_NUM_NEXTN_LAYERS:-2}"
+# MoE backend: TRTLLM is the frontier default across the sweep. The 0608-B200
+# MTP data used MEGAMOE_DEEPGEMM only at the top concurrencies (1k1k conc>=512);
+# set MOE_BACKEND=MEGAMOE_DEEPGEMM to reproduce those points.
+MOE_BACKEND="${MOE_BACKEND:-TRTLLM}"
+# 0608-B200 MTP frontier runs at MTP level 3 (overridable).
+MTP="${TRTLLM_DSV4_MTP_NUM_NEXTN_LAYERS:-3}"
 MAX_BATCH_SIZE=$(( CONC > 16 ? CONC : 16 ))
 CUDA_GRAPH_MAX_BATCH_SIZE="$MAX_BATCH_SIZE"
-KV_CACHE_FREE_MEM_FRACTION="${KV_CACHE_FREE_MEM_FRACTION:-0.50}"
+# free_gpu_memory_fraction from 0608-B200 MTP: 0.9 (TP / no DP-attn), 0.6 (DP-attn).
+if [[ "$DP_ATTENTION" == "true" ]]; then
+    KV_CACHE_FREE_MEM_FRACTION="${KV_CACHE_FREE_MEM_FRACTION:-0.6}"
+else
+    KV_CACHE_FREE_MEM_FRACTION="${KV_CACHE_FREE_MEM_FRACTION:-0.9}"
+fi
 
+# enable_lm_head_tp_in_adp: the 0608-B200 MTP frontier sets this on the DP-attn path.
+LM_HEAD_TP_IN_ADP_CONFIG=""
 ATTENTION_DP_CONFIG=""
 if [[ "$DP_ATTENTION" == "true" ]]; then
+    LM_HEAD_TP_IN_ADP_CONFIG="
+enable_lm_head_tp_in_adp: true"
     ATTENTION_DP_CONFIG="
 attention_dp_config:
-    batching_wait_iters: 0
-    enable_balance: true
-    timeout_iters: 60"
+    batching_wait_iters: 30
+    enable_balance: true"
 fi
 
 cat > "$EXTRA_CONFIG_FILE" << EOF
 cuda_graph_config:
     enable_padding: true
     max_batch_size: $CUDA_GRAPH_MAX_BATCH_SIZE
-enable_attention_dp: $DP_ATTENTION$ATTENTION_DP_CONFIG
+enable_attention_dp: $DP_ATTENTION$ATTENTION_DP_CONFIG$LM_HEAD_TP_IN_ADP_CONFIG
 print_iter_log: true
 kv_cache_config:
     tokens_per_block: 128
     dtype: fp8
     free_gpu_memory_fraction: $KV_CACHE_FREE_MEM_FRACTION
     enable_block_reuse: false
-stream_interval: 10
+stream_interval: 100
 num_postprocess_workers: 4
 moe_config:
     backend: $MOE_BACKEND
+    use_low_precision_moe_combine: true
 speculative_config:
     decoding_type: MTP
     num_nextn_predict_layers: $MTP

From a532ed681e2b58604458ef09c7dd1d4ef3154b6d Mon Sep 17 00:00:00 2001
From: Xianjie <5410381+qiaoxj07@users.noreply.github.com>
Date: Tue, 9 Jun 2026 13:20:55 +0800
Subject: [PATCH 2/6] Drop provenance comments; fold lm_head_tp_in_adp into
 ATTENTION_DP_CONFIG

- Remove the 0608-B200 provenance comments from the recipes and nvidia-master.yaml.
- In dsv4_fp4_b200_trt_mtp.sh, emit enable_lm_head_tp_in_adp from within the
  DP-attn ATTENTION_DP_CONFIG block instead of a separate variable, so the
  heredoc line matches the non-MTP recipe idiom. Generated YAML is unchanged.
---
 .github/configs/nvidia-master.yaml              |  7 -------
 .../fixed_seq_len/dsv4_fp4_b200_trt.sh          |  7 -------
 .../fixed_seq_len/dsv4_fp4_b200_trt_mtp.sh      | 17 +++--------------
 3 files changed, 3 insertions(+), 28 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 62cfef2a3..a02749d4d 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -1800,10 +1800,6 @@ dsv4-fp4-b200-vllm-agentic:
       - { tp: 8, offloading: cpu,  conc-list: [16, 32, 64] }
       - { tp: 8, ep: 8, dp-attn: true, offloading: cpu,  conc-list: [64, 128, 256] }
 
-# Recipe config (worker envs + trtllm-serve YAML in dsv4_fp4_b200_trt.sh) synced
-# from the 0608-B200 DeepSeek-V4-Pro agg frontier (deepseek-v4-pro-agg-202606,
-# MTP0 blocks): free_gpu_memory_fraction 0.9/0.7 (TP / DP-attn), stream_interval
-# 100, attention_dp batching_wait_iters 30, moe use_low_precision_moe_combine.
 dsv4-fp4-b200-trt:
   image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-9aa3715
   model: deepseek-ai/DeepSeek-V4-Pro
@@ -1825,9 +1821,6 @@ dsv4-fp4-b200-trt:
       - { tp: 8, conc-start: 1, conc-end: 32 }
       - { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 1024 }
 
-# Recipe config (dsv4_fp4_b200_trt_mtp.sh) synced from the 0608-B200 agg frontier
-# (MTP blocks): MTP level 3, free_gpu_memory_fraction 0.9/0.6 (TP / DP-attn),
-# enable_lm_head_tp_in_adp on the DP-attn path; other fields as dsv4-fp4-b200-trt.
 dsv4-fp4-b200-trt-mtp:
   image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-9aa3715
   model: deepseek-ai/DeepSeek-V4-Pro
diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt.sh
index e630e2c6e..115be8e50 100644
--- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt.sh
@@ -47,9 +47,6 @@ sanitize_slurm_mpi_env_for_trtllm
 export NCCL_NVLS_ENABLE="${NCCL_NVLS_ENABLE:-0}"
 echo "NCCL_NVLS_ENABLE: $NCCL_NVLS_ENABLE"
 
-# DeepSeek-V4 TRTLLM worker tuning envs, synced from the 0608-B200 agg frontier
-# (env_vars: worker_env_var + gen_worker_env_var; the user-specific
-# TLLM_AUTOTUNER_CACHE_PATH is intentionally omitted). All overridable.
 export TRTLLM_SERVER_DISABLE_GC="${TRTLLM_SERVER_DISABLE_GC:-1}"
 export TRTLLM_WORKER_DISABLE_GC="${TRTLLM_WORKER_DISABLE_GC:-1}"
 export NCCL_GRAPH_MIXING_SUPPORT="${NCCL_GRAPH_MIXING_SUPPORT:-0}"
@@ -65,13 +62,9 @@ nvidia-smi
 SERVER_LOG="$PWD/server.log"
 EXTRA_CONFIG_FILE="dsv4-fp4-trt.yml"
 
-# MoE backend: TRTLLM is the frontier default across the sweep. The 0608-B200
-# data used MEGAMOE_DEEPGEMM only at the very top concurrency (1k1k conc=2048);
-# set MOE_BACKEND=MEGAMOE_DEEPGEMM to reproduce that point.
 MOE_BACKEND="${MOE_BACKEND:-TRTLLM}"
 MAX_BATCH_SIZE=$(( CONC > 16 ? CONC : 16 ))
 CUDA_GRAPH_MAX_BATCH_SIZE="$MAX_BATCH_SIZE"
-# free_gpu_memory_fraction from 0608-B200: 0.9 (TP / no DP-attn), 0.7 (DP-attn).
 if [[ "$DP_ATTENTION" == "true" ]]; then
     KV_CACHE_FREE_MEM_FRACTION="${KV_CACHE_FREE_MEM_FRACTION:-0.7}"
 else
diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt_mtp.sh
index cf3d52ddf..6f8f06683 100644
--- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt_mtp.sh
@@ -46,9 +46,6 @@ sanitize_slurm_mpi_env_for_trtllm
 export NCCL_NVLS_ENABLE="${NCCL_NVLS_ENABLE:-0}"
 echo "NCCL_NVLS_ENABLE: $NCCL_NVLS_ENABLE"
 
-# DeepSeek-V4 TRTLLM worker tuning envs, synced from the 0608-B200 agg frontier
-# (env_vars: worker_env_var + gen_worker_env_var; the user-specific
-# TLLM_AUTOTUNER_CACHE_PATH is intentionally omitted). All overridable.
 export TRTLLM_SERVER_DISABLE_GC="${TRTLLM_SERVER_DISABLE_GC:-1}"
 export TRTLLM_WORKER_DISABLE_GC="${TRTLLM_WORKER_DISABLE_GC:-1}"
 export NCCL_GRAPH_MIXING_SUPPORT="${NCCL_GRAPH_MIXING_SUPPORT:-0}"
@@ -64,38 +61,30 @@ nvidia-smi
 SERVER_LOG="$PWD/server.log"
 EXTRA_CONFIG_FILE="dsv4-fp4-trt-mtp.yml"
 
-# MoE backend: TRTLLM is the frontier default across the sweep. The 0608-B200
-# MTP data used MEGAMOE_DEEPGEMM only at the top concurrencies (1k1k conc>=512);
-# set MOE_BACKEND=MEGAMOE_DEEPGEMM to reproduce those points.
 MOE_BACKEND="${MOE_BACKEND:-TRTLLM}"
-# 0608-B200 MTP frontier runs at MTP level 3 (overridable).
 MTP="${TRTLLM_DSV4_MTP_NUM_NEXTN_LAYERS:-3}"
 MAX_BATCH_SIZE=$(( CONC > 16 ? CONC : 16 ))
 CUDA_GRAPH_MAX_BATCH_SIZE="$MAX_BATCH_SIZE"
-# free_gpu_memory_fraction from 0608-B200 MTP: 0.9 (TP / no DP-attn), 0.6 (DP-attn).
 if [[ "$DP_ATTENTION" == "true" ]]; then
     KV_CACHE_FREE_MEM_FRACTION="${KV_CACHE_FREE_MEM_FRACTION:-0.6}"
 else
     KV_CACHE_FREE_MEM_FRACTION="${KV_CACHE_FREE_MEM_FRACTION:-0.9}"
 fi
 
-# enable_lm_head_tp_in_adp: the 0608-B200 MTP frontier sets this on the DP-attn path.
-LM_HEAD_TP_IN_ADP_CONFIG=""
 ATTENTION_DP_CONFIG=""
 if [[ "$DP_ATTENTION" == "true" ]]; then
-    LM_HEAD_TP_IN_ADP_CONFIG="
-enable_lm_head_tp_in_adp: true"
     ATTENTION_DP_CONFIG="
 attention_dp_config:
     batching_wait_iters: 30
-    enable_balance: true"
+    enable_balance: true
+enable_lm_head_tp_in_adp: true"
 fi
 
 cat > "$EXTRA_CONFIG_FILE" << EOF
 cuda_graph_config:
     enable_padding: true
     max_batch_size: $CUDA_GRAPH_MAX_BATCH_SIZE
-enable_attention_dp: $DP_ATTENTION$ATTENTION_DP_CONFIG$LM_HEAD_TP_IN_ADP_CONFIG
+enable_attention_dp: $DP_ATTENTION$ATTENTION_DP_CONFIG
 print_iter_log: true
 kv_cache_config:
     tokens_per_block: 128

From b2bffb1a7c353b7a679cb1ac997133805416abf2 Mon Sep 17 00:00:00 2001
From: Xianjie <5410381+qiaoxj07@users.noreply.github.com>
Date: Tue, 9 Jun 2026 13:34:13 +0800
Subject: [PATCH 3/6] Extend MTP sweep to cover frontier; adapt MoE backend and
 MTP level by concurrency

Sweep (nvidia-master.yaml, dsv4-fp4-b200-trt-mtp):
- 1k1k DP conc-end 512 -> 1024; 8k1k DP conc-end 128 -> 256, so the MTP sweep
  reaches the top speculative frontier points. (The 1k1k conc=2048 and 8k1k
  conc=256 MTP0 points come from the MTP0 sweep, which already covers them.)

Recipes: pick MoE backend / MTP level per concurrency to match the frontier,
since the single-node search-space cannot pass them per entry.
- dsv4_fp4_b200_trt.sh: MEGAMOE_DEEPGEMM for short ISL (<= 1024) at conc >= 2048
  (else TRTLLM).
- dsv4_fp4_b200_trt_mtp.sh: MEGAMOE_DEEPGEMM for short ISL (<= 1024) at
  conc >= 512 (else TRTLLM); MTP draft length steps 3 -> 2 for long ISL
  (>= 4096) at conc >= 128.
All overridable via MOE_BACKEND / TRTLLM_DSV4_MTP_NUM_NEXTN_LAYERS.
---
 .github/configs/nvidia-master.yaml               |  4 ++--
 .../fixed_seq_len/dsv4_fp4_b200_trt.sh           |  8 +++++++-
 .../fixed_seq_len/dsv4_fp4_b200_trt_mtp.sh       | 16 ++++++++++++++--
 3 files changed, 23 insertions(+), 5 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index a02749d4d..9b9dd7943 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -1835,12 +1835,12 @@ dsv4-fp4-b200-trt-mtp:
       osl: 1024
       search-space:
       - { tp: 8, conc-start: 1, conc-end: 32, spec-decoding: mtp }
-      - { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 512, spec-decoding: mtp }
+      - { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 1024, spec-decoding: mtp }
     - isl: 8192
       osl: 1024
       search-space:
       - { tp: 8, conc-start: 1, conc-end: 32, spec-decoding: mtp }
-      - { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 128, spec-decoding: mtp }
+      - { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 256, spec-decoding: mtp }
 
 # MTP variant of dsv4-fp4-b200-vllm. Mirrors the base search space and adds
 # --speculative-config '{"method":"mtp","num_speculative_tokens":2}'.
diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt.sh
index 115be8e50..0999a7627 100644
--- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt.sh
@@ -62,7 +62,13 @@ nvidia-smi
 SERVER_LOG="$PWD/server.log"
 EXTRA_CONFIG_FILE="dsv4-fp4-trt.yml"
 
-MOE_BACKEND="${MOE_BACKEND:-TRTLLM}"
+# MoE backend: TRTLLM at low/mid concurrency; switch to MEGAMOE_DEEPGEMM at the
+# top concurrency for short ISL (1k).
+if [[ "$ISL" -le 1024 && "$CONC" -ge 2048 ]]; then
+    MOE_BACKEND="${MOE_BACKEND:-MEGAMOE_DEEPGEMM}"
+else
+    MOE_BACKEND="${MOE_BACKEND:-TRTLLM}"
+fi
 MAX_BATCH_SIZE=$(( CONC > 16 ? CONC : 16 ))
 CUDA_GRAPH_MAX_BATCH_SIZE="$MAX_BATCH_SIZE"
 if [[ "$DP_ATTENTION" == "true" ]]; then
diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt_mtp.sh
index 6f8f06683..b6c1bbc98 100644
--- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt_mtp.sh
@@ -61,8 +61,20 @@ nvidia-smi
 SERVER_LOG="$PWD/server.log"
 EXTRA_CONFIG_FILE="dsv4-fp4-trt-mtp.yml"
 
-MOE_BACKEND="${MOE_BACKEND:-TRTLLM}"
-MTP="${TRTLLM_DSV4_MTP_NUM_NEXTN_LAYERS:-3}"
+# MoE backend: TRTLLM at low/mid concurrency; switch to MEGAMOE_DEEPGEMM at high
+# concurrency for short ISL (1k).
+if [[ "$ISL" -le 1024 && "$CONC" -ge 512 ]]; then
+    MOE_BACKEND="${MOE_BACKEND:-MEGAMOE_DEEPGEMM}"
+else
+    MOE_BACKEND="${MOE_BACKEND:-TRTLLM}"
+fi
+# MTP draft length: 3 at low/mid concurrency; steps down to 2 at high concurrency
+# for long ISL (8k).
+if [[ "$ISL" -ge 4096 && "$CONC" -ge 128 ]]; then
+    MTP="${TRTLLM_DSV4_MTP_NUM_NEXTN_LAYERS:-2}"
+else
+    MTP="${TRTLLM_DSV4_MTP_NUM_NEXTN_LAYERS:-3}"
+fi
 MAX_BATCH_SIZE=$(( CONC > 16 ? CONC : 16 ))
 CUDA_GRAPH_MAX_BATCH_SIZE="$MAX_BATCH_SIZE"
 if [[ "$DP_ATTENTION" == "true" ]]; then

From 0eda0d8d9786f3a23acd403661a8ae8bafad510b Mon Sep 17 00:00:00 2001
From: Xianjie <5410381+qiaoxj07@users.noreply.github.com>
Date: Tue, 9 Jun 2026 14:01:24 +0800
Subject: [PATCH 4/6] Use max_draft_len for MTP; drop OSL from max_num_tokens

- dsv4_fp4_b200_trt_mtp.sh: speculative_config field num_nextn_predict_layers
  -> max_draft_len (matches the agg config / deployed image).
- max_num_tokens drops the OSL term in both recipes (output tokens are emitted
  one at a time, not in the prefill chunk): ISL + 256 (MTP0) and
  ISL + (MTP+1)*batch + 256 (MTP, keeps the speculative-verification headroom).
  max_seq_len stays floored at >= 8192 as headroom for server-side chat-template
  tokens on the openai-chat path.
---
 benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt.sh     | 2 +-
 benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt_mtp.sh | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt.sh
index 0999a7627..ce567c908 100644
--- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt.sh
@@ -107,7 +107,7 @@ echo "Generated config file contents:"
 cat "$EXTRA_CONFIG_FILE"
 
 MAX_MODEL_LEN=$(( MAX_MODEL_LEN > 8192 ? MAX_MODEL_LEN : 8192 ))
-MAX_NUM_TOKENS=$(( ISL + OSL + 256 ))
+MAX_NUM_TOKENS=$(( ISL + 256 ))
 MAX_NUM_TOKENS=$(( MAX_NUM_TOKENS > 8192 ? MAX_NUM_TOKENS : 8192 ))
 
 if [ "${EVAL_ONLY}" = "true" ]; then
diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt_mtp.sh
index b6c1bbc98..0c7f32363 100644
--- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt_mtp.sh
@@ -110,14 +110,14 @@ moe_config:
     use_low_precision_moe_combine: true
 speculative_config:
     decoding_type: MTP
-    num_nextn_predict_layers: $MTP
+    max_draft_len: $MTP
 EOF
 
 echo "Generated config file contents:"
 cat "$EXTRA_CONFIG_FILE"
 
 MAX_MODEL_LEN=$(( MAX_MODEL_LEN > 8192 ? MAX_MODEL_LEN : 8192 ))
-MAX_NUM_TOKENS=$(( ISL + OSL + (MTP + 1) * MAX_BATCH_SIZE + 256 ))
+MAX_NUM_TOKENS=$(( ISL + (MTP + 1) * MAX_BATCH_SIZE + 256 ))
 MAX_NUM_TOKENS=$(( MAX_NUM_TOKENS > 8192 ? MAX_NUM_TOKENS : 8192 ))
 
 if [ "${EVAL_ONLY}" = "true" ]; then

From f5dfcb53a7e9b98731912cf3e3f65e56d78d0f46 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Wed, 10 Jun 2026 10:37:06 -0700
Subject: [PATCH 5/6] Update DSv4 B200 TRT image to feat-deepseek_v4-c185066

Bump the dsv4-fp4-b200-trt and dsv4-fp4-b200-trt-mtp images from
feat-deepseek_v4-9aa3715 to feat-deepseek_v4-c185066, and add a
perf-changelog entry covering the image bump and the B200 agg
frontier config sync already in this PR.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .github/configs/nvidia-master.yaml |  4 ++--
 perf-changelog.yaml                | 10 ++++++++++
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 9b9dd7943..fef984cf2 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -1801,7 +1801,7 @@ dsv4-fp4-b200-vllm-agentic:
       - { tp: 8, ep: 8, dp-attn: true, offloading: cpu,  conc-list: [64, 128, 256] }
 
 dsv4-fp4-b200-trt:
-  image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-9aa3715
+  image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-c185066
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: b200-dsv4
@@ -1822,7 +1822,7 @@ dsv4-fp4-b200-trt:
       - { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 1024 }
 
 dsv4-fp4-b200-trt-mtp:
-  image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-9aa3715
+  image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-c185066
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: b200-dsv4
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 5622173f1..c06b73b6a 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3531,3 +3531,13 @@
     - "The Rust frontend replaces only the Python serving/API layer (HTTP, tokenization, scheduling glue, detokenization) and spawns the same Python EngineCore, so GPU kernels/attention/MoE GEMM/KV cache are untouched"
     - "A/B sweep (28 single-node points, 1k1k + 8k1k, TP 1/2/4) vs the Python-frontend baseline (run 26696260751): throughput Pareto-neutral (peak tok/s/GPU within <1.5%, frontiers coincident) and TPOT flat (+-0.5%); TTFT improves ~8% at 1k1k and ~22% at 8k1k (every point), the expected signature of lower frontend CPU latency before first token, scaling with input length"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1634
+
+- config-keys:
+    - dsv4-fp4-b200-trt
+    - dsv4-fp4-b200-trt-mtp
+  description:
+    - "Update the B200 TensorRT-LLM DeepSeek-V4-Pro image to ghcr.io/semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-c185066"
+    - "Sync the dsv4-fp4-b200-trt and dsv4-fp4-b200-trt-mtp recipes with the B200 aggregated frontier config (worker GC off, NCCL graph mixing off, mimalloc/PyTorch alloc tweaks, higher KV cache fractions by DP path, stream_interval 100, use_low_precision_moe_combine, DP batching_wait_iters 30, max_num_tokens drops the OSL term)"
+    - "MTP recipe uses max_draft_len with a variable default draft length, enable_lm_head_tp_in_adp on the DP-attn path, and removes timeout_iters from the DP config"
+    - "Raise dsv4-fp4-b200-trt-mtp DP-attn conc-end (1k ISL: 512->1024; 8k ISL: 128->256) to cover the new high-concurrency regime"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1699

From 8ef14b8e60c18490b6dd81d75114737dd9bccf99 Mon Sep 17 00:00:00 2001
From: Bryan Shan <58582368+Oseltamivir@users.noreply.github.com>
Date: Thu, 11 Jun 2026 23:14:45 -0700
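Subject: [PATCH 6/6] Lower DSv4 B200 TRT DP-attn conc-end in nvidia-master.yaml

- dsv4-fp4-b200-trt: DP-attn conc-end 2048 -> 512 (1k1k) and 1024 -> 512 (8k1k).
- dsv4-fp4-b200-trt-mtp: 1k1k DP-attn conc-end 1024 -> 256.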

---
 .github/configs/nvidia-master.yaml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 8b4468d92..a5b2fc4b3 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -1814,12 +1814,12 @@ dsv4-fp4-b200-trt:
       osl: 1024
       search-space:
       - { tp: 8, conc-start: 1, conc-end: 32 }
-      - { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 2048 }
+      - { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 512 }
     - isl: 8192
       osl: 1024
       search-space:
       - { tp: 8, conc-start: 1, conc-end: 32 }
-      - { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 1024 }
+      - { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 512 }
 
 dsv4-fp4-b200-trt-mtp:
   image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-c185066
@@ -1835,7 +1835,7 @@ dsv4-fp4-b200-trt-mtp:
       osl: 1024
       search-space:
       - { tp: 8, conc-start: 1, conc-end: 32, spec-decoding: mtp }
-      - { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 1024, spec-decoding: mtp }
+      - { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 256, spec-decoding: mtp }
     - isl: 8192
       osl: 1024
       search-space: