From fdd4f762d8b9b7a251148cb3fe9eac2b48b63248 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Wed, 10 Jun 2026 11:05:36 -0700
Subject: [PATCH 1/7] Sync dsv4-fp4-b300-trt recipes with B300 agg frontier
 config

B300 analog of PR #1699 (B200). Apply the same TensorRT-LLM recipe sync
to dsv4_fp4_b300_trt.sh (MTP0) and dsv4_fp4_b300_trt_mtp.sh (MTP), and
bump the dsv4-fp4-b300-trt / -mtp images to feat-deepseek_v4-c185066.

Recipe changes (both):
- Worker envs (overridable): TRTLLM_SERVER_DISABLE_GC, TRTLLM_WORKER_DISABLE_GC,
  NCCL_GRAPH_MIXING_SUPPORT=0, MIMALLOC_PURGE_DELAY=0,
  PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True.
- kv_cache free_gpu_memory_fraction (was 0.50): 0.9 without DP-attn; with
  DP-attn, 0.7 (non-MTP) / 0.6 (MTP).
- attention_dp_config batching_wait_iters 0 -> 30, drop timeout_iters.
- stream_interval 10 -> 100; moe_config.use_low_precision_moe_combine: true.
- MOE_BACKEND overridable; default switches from TRTLLM to MEGAMOE_DEEPGEMM at
  high conc on 1k ISL (ISL <= 1024 and conc >= 2048 non-MTP, conc >= 512 MTP).
- max_num_tokens drops the OSL term.

MTP additionally: max_draft_len (was num_nextn_predict_layers), default draft
length 3 stepping down to 2 at high conc on 8k ISL (ISL >= 4096 and
conc >= 128), enable_lm_head_tp_in_adp on DP-attn.

B300-specific bits preserved: MODEL_PATH download block, TRTLLM_MHC_ENABLE_FUSED_HC=1,
trtllm-serve "$MODEL_PATH". B300 search space left as-is (already covers the
high-concurrency frontier the recipe changes target).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .github/configs/nvidia-master.yaml            |  4 +-
 .../fixed_seq_len/dsv4_fp4_b300_trt.sh        | 30 ++++++++++----
 .../fixed_seq_len/dsv4_fp4_b300_trt_mtp.sh    | 39 +++++++++++++++----
 3 files changed, 56 insertions(+), 17 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index a02749d4d..9e0200d1f 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -3049,7 +3049,7 @@ dsv4-fp4-b300-vllm-agentic:
       - { tp: 8, ep: 8, dp-attn: true, offloading: cpu,  conc-list: [128, 256, 512] }
 
 dsv4-fp4-b300-trt:
-  image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-9aa3715
+  image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-c185066
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: b300
@@ -3072,7 +3072,7 @@ dsv4-fp4-b300-trt:
       - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 1024 }
 
 dsv4-fp4-b300-trt-mtp:
-  image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-9aa3715
+  image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-c185066
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: b300
diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_trt.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_trt.sh
index b0150e10d..b23ab7a07 100644
--- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_trt.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_trt.sh
@@ -59,23 +59,38 @@ sanitize_slurm_mpi_env_for_trtllm
 export NCCL_NVLS_ENABLE="${NCCL_NVLS_ENABLE:-0}"
 echo "NCCL_NVLS_ENABLE: $NCCL_NVLS_ENABLE"
 
+export TRTLLM_SERVER_DISABLE_GC="${TRTLLM_SERVER_DISABLE_GC:-1}"
+export TRTLLM_WORKER_DISABLE_GC="${TRTLLM_WORKER_DISABLE_GC:-1}"
+export NCCL_GRAPH_MIXING_SUPPORT="${NCCL_GRAPH_MIXING_SUPPORT:-0}"
+export MIMALLOC_PURGE_DELAY="${MIMALLOC_PURGE_DELAY:-0}"
+export PYTORCH_CUDA_ALLOC_CONF="${PYTORCH_CUDA_ALLOC_CONF:-expandable_segments:True}"
+
 nvidia-smi
 
 SERVER_LOG="$PWD/server.log"
 EXTRA_CONFIG_FILE="dsv4-fp4-trt.yml"
 
-MOE_BACKEND="TRTLLM"
+# MoE backend: TRTLLM at low/mid concurrency; switch to MEGAMOE_DEEPGEMM at the
+# top concurrency for short ISL (1k).
+if [[ "$ISL" -le 1024 && "$CONC" -ge 2048 ]]; then
+    MOE_BACKEND="${MOE_BACKEND:-MEGAMOE_DEEPGEMM}"
+else
+    MOE_BACKEND="${MOE_BACKEND:-TRTLLM}"
+fi
 MAX_BATCH_SIZE=$(( CONC > 16 ? CONC : 16 ))
 CUDA_GRAPH_MAX_BATCH_SIZE="$MAX_BATCH_SIZE"
-KV_CACHE_FREE_MEM_FRACTION="${KV_CACHE_FREE_MEM_FRACTION:-0.50}"
+if [[ "$DP_ATTENTION" == "true" ]]; then
+    KV_CACHE_FREE_MEM_FRACTION="${KV_CACHE_FREE_MEM_FRACTION:-0.7}"
+else
+    KV_CACHE_FREE_MEM_FRACTION="${KV_CACHE_FREE_MEM_FRACTION:-0.9}"
+fi
 
 ATTENTION_DP_CONFIG=""
 if [[ "$DP_ATTENTION" == "true" ]]; then
     ATTENTION_DP_CONFIG="
 attention_dp_config:
-    batching_wait_iters: 0
-    enable_balance: true
-    timeout_iters: 60"
+    batching_wait_iters: 30
+    enable_balance: true"
 fi
 
 cat > "$EXTRA_CONFIG_FILE" << EOF
@@ -89,17 +104,18 @@ kv_cache_config:
     dtype: fp8
     free_gpu_memory_fraction: $KV_CACHE_FREE_MEM_FRACTION
     enable_block_reuse: false
-stream_interval: 10
+stream_interval: 100
 num_postprocess_workers: 4
 moe_config:
     backend: $MOE_BACKEND
+    use_low_precision_moe_combine: true
 EOF
 
 echo "Generated config file contents:"
 cat "$EXTRA_CONFIG_FILE"
 
 MAX_MODEL_LEN=$(( MAX_MODEL_LEN > 8192 ? MAX_MODEL_LEN : 8192 ))
-MAX_NUM_TOKENS=$(( ISL + OSL + 256 ))
+MAX_NUM_TOKENS=$(( ISL + 256 ))
 MAX_NUM_TOKENS=$(( MAX_NUM_TOKENS > 8192 ? MAX_NUM_TOKENS : 8192 ))
 
 if [ "${EVAL_ONLY}" = "true" ]; then
diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_trt_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_trt_mtp.sh
index 507b96e34..11a6c7b9c 100644
--- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_trt_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_trt_mtp.sh
@@ -58,24 +58,46 @@ sanitize_slurm_mpi_env_for_trtllm
 export NCCL_NVLS_ENABLE="${NCCL_NVLS_ENABLE:-0}"
 echo "NCCL_NVLS_ENABLE: $NCCL_NVLS_ENABLE"
 
+export TRTLLM_SERVER_DISABLE_GC="${TRTLLM_SERVER_DISABLE_GC:-1}"
+export TRTLLM_WORKER_DISABLE_GC="${TRTLLM_WORKER_DISABLE_GC:-1}"
+export NCCL_GRAPH_MIXING_SUPPORT="${NCCL_GRAPH_MIXING_SUPPORT:-0}"
+export MIMALLOC_PURGE_DELAY="${MIMALLOC_PURGE_DELAY:-0}"
+export PYTORCH_CUDA_ALLOC_CONF="${PYTORCH_CUDA_ALLOC_CONF:-expandable_segments:True}"
+
 nvidia-smi
 
 SERVER_LOG="$PWD/server.log"
 EXTRA_CONFIG_FILE="dsv4-fp4-trt-mtp.yml"
 
-MOE_BACKEND="TRTLLM"
-MTP="${TRTLLM_DSV4_MTP_NUM_NEXTN_LAYERS:-2}"
+# MoE backend: TRTLLM at low/mid concurrency; switch to MEGAMOE_DEEPGEMM at high
+# concurrency for short ISL (1k).
+if [[ "$ISL" -le 1024 && "$CONC" -ge 512 ]]; then
+    MOE_BACKEND="${MOE_BACKEND:-MEGAMOE_DEEPGEMM}"
+else
+    MOE_BACKEND="${MOE_BACKEND:-TRTLLM}"
+fi
+# MTP draft length: 3 at low/mid concurrency; steps down to 2 at high concurrency
+# for long ISL (8k).
+if [[ "$ISL" -ge 4096 && "$CONC" -ge 128 ]]; then
+    MTP="${TRTLLM_DSV4_MTP_NUM_NEXTN_LAYERS:-2}"
+else
+    MTP="${TRTLLM_DSV4_MTP_NUM_NEXTN_LAYERS:-3}"
+fi
 MAX_BATCH_SIZE=$(( CONC > 16 ? CONC : 16 ))
 CUDA_GRAPH_MAX_BATCH_SIZE="$MAX_BATCH_SIZE"
-KV_CACHE_FREE_MEM_FRACTION="${KV_CACHE_FREE_MEM_FRACTION:-0.50}"
+if [[ "$DP_ATTENTION" == "true" ]]; then
+    KV_CACHE_FREE_MEM_FRACTION="${KV_CACHE_FREE_MEM_FRACTION:-0.6}"
+else
+    KV_CACHE_FREE_MEM_FRACTION="${KV_CACHE_FREE_MEM_FRACTION:-0.9}"
+fi
 
 ATTENTION_DP_CONFIG=""
 if [[ "$DP_ATTENTION" == "true" ]]; then
     ATTENTION_DP_CONFIG="
 attention_dp_config:
-    batching_wait_iters: 0
+    batching_wait_iters: 30
     enable_balance: true
-    timeout_iters: 60"
+enable_lm_head_tp_in_adp: true"
 fi
 
 cat > "$EXTRA_CONFIG_FILE" << EOF
@@ -89,20 +111,21 @@ kv_cache_config:
     dtype: fp8
     free_gpu_memory_fraction: $KV_CACHE_FREE_MEM_FRACTION
     enable_block_reuse: false
-stream_interval: 10
+stream_interval: 100
 num_postprocess_workers: 4
 moe_config:
     backend: $MOE_BACKEND
+    use_low_precision_moe_combine: true
 speculative_config:
     decoding_type: MTP
-    num_nextn_predict_layers: $MTP
+    max_draft_len: $MTP
 EOF
 
 echo "Generated config file contents:"
 cat "$EXTRA_CONFIG_FILE"
 
 MAX_MODEL_LEN=$(( MAX_MODEL_LEN > 8192 ? MAX_MODEL_LEN : 8192 ))
-MAX_NUM_TOKENS=$(( ISL + OSL + (MTP + 1) * MAX_BATCH_SIZE + 256 ))
+MAX_NUM_TOKENS=$(( ISL + (MTP + 1) * MAX_BATCH_SIZE + 256 ))
 MAX_NUM_TOKENS=$(( MAX_NUM_TOKENS > 8192 ? MAX_NUM_TOKENS : 8192 ))
 
 if [ "${EVAL_ONLY}" = "true" ]; then

From c315646093bf10f403d1c4c3842c8999e1fa0347 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Wed, 10 Jun 2026 11:06:20 -0700
Subject: [PATCH 2/7] Add perf-changelog entry for B300 DSv4 TRT image + recipe
 sync

Covers the dsv4-fp4-b300-trt / -mtp image bump to feat-deepseek_v4-c185066
and the B300 agg frontier recipe sync (PR #1703, B300 analog of #1699).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 perf-changelog.yaml | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 5622173f1..bb6b4dd84 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3531,3 +3531,13 @@
     - "The Rust frontend replaces only the Python serving/API layer (HTTP, tokenization, scheduling glue, detokenization) and spawns the same Python EngineCore, so GPU kernels/attention/MoE GEMM/KV cache are untouched"
     - "A/B sweep (28 single-node points, 1k1k + 8k1k, TP 1/2/4) vs the Python-frontend baseline (run 26696260751): throughput Pareto-neutral (peak tok/s/GPU within <1.5%, frontiers coincident) and TPOT flat (+-0.5%); TTFT improves ~8% at 1k1k and ~22% at 8k1k (every point), the expected signature of lower frontend CPU latency before first token, scaling with input length"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1634
+
+- config-keys:
+    - dsv4-fp4-b300-trt
+    - dsv4-fp4-b300-trt-mtp
+  description:
+    - "Update the B300 TensorRT-LLM DeepSeek-V4-Pro image to ghcr.io/semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-c185066"
+    - "B300 analog of PR #1699 (B200): sync the dsv4-fp4-b300-trt and dsv4-fp4-b300-trt-mtp recipes with the agg frontier config (worker GC off, NCCL graph mixing off, mimalloc/PyTorch alloc tweaks, higher KV cache fractions by DP path, stream_interval 100, use_low_precision_moe_combine, DP batching_wait_iters 30, max_num_tokens drops the OSL term)"
+    - "MTP recipe uses max_draft_len with a variable default draft length, enable_lm_head_tp_in_adp on the DP-attn path, and removes timeout_iters from the DP config"
+    - "B300-specific bits preserved (MODEL_PATH download block, TRTLLM_MHC_ENABLE_FUSED_HC=1, trtllm-serve MODEL_PATH); search space left as-is since the B300 sweeps already cover the high-concurrency regime"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1703

From 1e548b9860d3ad83364b1f17612e09b13370d4fa Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Wed, 10 Jun 2026 22:00:40 -0700
Subject: [PATCH 3/7] Trim dsv4-fp4-b300-trt 8k1k conc to max 256

Cap the 8k1k tp8/ep8 DP-attn sweep at conc 256 (was 256-1024) for
dsv4-fp4-b300-trt. trt-mtp and the 1k1k sweep are unchanged.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .github/configs/nvidia-master.yaml | 2 +-
 perf-changelog.yaml                | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 9e0200d1f..fff633af6 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -3069,7 +3069,7 @@ dsv4-fp4-b300-trt:
       search-space:
       - { tp: 4, conc-start: 1, conc-end: 32 }
       - { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 64 }
-      - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 1024 }
+      - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 256 }
 
 dsv4-fp4-b300-trt-mtp:
   image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-c185066
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index bb6b4dd84..5edbaed5a 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3539,5 +3539,5 @@
     - "Update the B300 TensorRT-LLM DeepSeek-V4-Pro image to ghcr.io/semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-c185066"
     - "B300 analog of PR #1699 (B200): sync the dsv4-fp4-b300-trt and dsv4-fp4-b300-trt-mtp recipes with the agg frontier config (worker GC off, NCCL graph mixing off, mimalloc/PyTorch alloc tweaks, higher KV cache fractions by DP path, stream_interval 100, use_low_precision_moe_combine, DP batching_wait_iters 30, max_num_tokens drops the OSL term)"
     - "MTP recipe uses max_draft_len with a variable default draft length, enable_lm_head_tp_in_adp on the DP-attn path, and removes timeout_iters from the DP config"
-    - "B300-specific bits preserved (MODEL_PATH download block, TRTLLM_MHC_ENABLE_FUSED_HC=1, trtllm-serve MODEL_PATH); search space left as-is since the B300 sweeps already cover the high-concurrency regime"
+    - "B300-specific bits preserved (MODEL_PATH download block, TRTLLM_MHC_ENABLE_FUSED_HC=1, trtllm-serve MODEL_PATH); 1k1k search space left as-is, 8k1k dsv4-fp4-b300-trt conc-end trimmed from 1024 to 256 on the tp8/ep8 DP-attn row"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1703

From d1cf7a4778261bf4420dfbed7cfc87c158cf5ed5 Mon Sep 17 00:00:00 2001
From: Bryan Shan <58582368+Oseltamivir@users.noreply.github.com>
Date: Wed, 10 Jun 2026 22:02:24 -0700
Subject: [PATCH 4/7] Resolve merge-conflict markers in perf-changelog.yaml

---
 perf-changelog.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index c89bd32a4..9d16955ab 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3533,7 +3533,6 @@
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1634
 
 - config-keys:
-<<<<<<< sync-dsv4-fp4-b300-trt-0608-config
     - dsv4-fp4-b300-trt
     - dsv4-fp4-b300-trt-mtp
   description:
@@ -3542,7 +3541,8 @@
     - "MTP recipe uses max_draft_len with a variable default draft length, enable_lm_head_tp_in_adp on the DP-attn path, and removes timeout_iters from the DP config"
     - "B300-specific bits preserved (MODEL_PATH download block, TRTLLM_MHC_ENABLE_FUSED_HC=1, trtllm-serve MODEL_PATH); 1k1k search space left as-is, 8k1k dsv4-fp4-b300-trt conc-end trimmed from 1024 to 256 on the tp8/ep8 DP-attn row"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1703
-=======
+
+- config-keys:
     - dsv4-fp4-b300-sglang-mtp
   description:
     - "Align MTP env vars to GB300: replace PRECOMPILE=0 with FAST_WARMUP=1, add RADIX_FORCE_MISS, DEFAULT_THINKING, DSV4_REASONING_EFFORT=max"

From a5b4fd492e75713b2f33930cb1148b1802dbde75 Mon Sep 17 00:00:00 2001
From: Bryan Shan <58582368+Oseltamivir@users.noreply.github.com>
Date: Wed, 10 Jun 2026 22:03:24 -0700
Subject: [PATCH 5/7] Remove leftover conflict marker from perf-changelog.yaml

---
 perf-changelog.yaml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 9d16955ab..421a4bd47 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3557,4 +3557,3 @@
     - "MI355x DSR1-FP4: Include TP4 configurations for 8k1k"
     - "Expand the TP sweep (included TP=4) for 8k/1k configuration for conc=4 to 64"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1692
->>>>>>> main

From dbd6084ad6464a4235822823e2ee56a81b1c613d Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Thu, 11 Jun 2026 09:23:52 -0700
Subject: [PATCH 6/7] Cap B300 DSv4 TRT cudagraph warmup at batch 1024; revert
 8k1k conc trim

Revert the dsv4-fp4-b300-trt 8k1k conc-end trim (back to 1024) and instead
cap cuda_graph_config.max_batch_size at 1024 on both b300-trt and
b300-trt-mtp.

TRTLLM_MLA_EXTRA_OVERLAP hands MLA prologue tensors across CUDA streams
without record_stream(), so CUDA-graph warmup at decode batch >1024
(repros at 1088, e.g. tp8/ep8 dp-attn conc-2048 on B300) hits a
use-after-free and fails with CUDA_ERROR_ILLEGAL_ADDRESS. Capping graph
capture at 1024 avoids warming up the >1024 graph; runtime --max_batch_size
stays = CONC, so batches >1024 run eager. Workaround until
NVIDIA/TensorRT-LLM#15265 ships in the image.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .github/configs/nvidia-master.yaml                        | 2 +-
 benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_trt.sh | 8 +++++++-
 .../single_node/fixed_seq_len/dsv4_fp4_b300_trt_mtp.sh    | 8 +++++++-
 perf-changelog.yaml                                       | 3 ++-
 4 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 106801c93..aec1bc95c 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -3069,7 +3069,7 @@ dsv4-fp4-b300-trt:
       search-space:
       - { tp: 4, conc-start: 1, conc-end: 32 }
       - { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 64 }
-      - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 256 }
+      - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 1024 }
 
 dsv4-fp4-b300-trt-mtp:
   image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-c185066
diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_trt.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_trt.sh
index b23ab7a07..bcd1fbf6a 100644
--- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_trt.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_trt.sh
@@ -78,7 +78,13 @@ else
     MOE_BACKEND="${MOE_BACKEND:-TRTLLM}"
 fi
 MAX_BATCH_SIZE=$(( CONC > 16 ? CONC : 16 ))
-CUDA_GRAPH_MAX_BATCH_SIZE="$MAX_BATCH_SIZE"
+# Cap CUDA-graph capture at batch 1024. TRTLLM_MLA_EXTRA_OVERLAP hands MLA
+# prologue tensors across streams without record_stream(), so graph warmup at
+# decode batch >1024 (repros at 1088, e.g. tp8/ep8 dp-attn conc-2048 on B300)
+# hits a use-after-free -> CUDA_ERROR_ILLEGAL_ADDRESS. Fixed upstream in
+# NVIDIA/TensorRT-LLM#15265; cap until that fix ships in the image. Runtime
+# --max_batch_size stays = CONC, so batches >1024 just run eager.
+CUDA_GRAPH_MAX_BATCH_SIZE=$(( MAX_BATCH_SIZE < 1024 ? MAX_BATCH_SIZE : 1024 ))
 if [[ "$DP_ATTENTION" == "true" ]]; then
     KV_CACHE_FREE_MEM_FRACTION="${KV_CACHE_FREE_MEM_FRACTION:-0.7}"
 else
diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_trt_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_trt_mtp.sh
index 11a6c7b9c..bb0362c25 100644
--- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_trt_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_trt_mtp.sh
@@ -84,7 +84,13 @@ else
     MTP="${TRTLLM_DSV4_MTP_NUM_NEXTN_LAYERS:-3}"
 fi
 MAX_BATCH_SIZE=$(( CONC > 16 ? CONC : 16 ))
-CUDA_GRAPH_MAX_BATCH_SIZE="$MAX_BATCH_SIZE"
+# Cap CUDA-graph capture at batch 1024. TRTLLM_MLA_EXTRA_OVERLAP hands MLA
+# prologue tensors across streams without record_stream(), so graph warmup at
+# decode batch >1024 (repros at 1088, e.g. tp8/ep8 dp-attn conc-2048 on B300)
+# hits a use-after-free -> CUDA_ERROR_ILLEGAL_ADDRESS. Fixed upstream in
+# NVIDIA/TensorRT-LLM#15265; cap until that fix ships in the image. Runtime
+# --max_batch_size stays = CONC, so batches >1024 just run eager.
+CUDA_GRAPH_MAX_BATCH_SIZE=$(( MAX_BATCH_SIZE < 1024 ? MAX_BATCH_SIZE : 1024 ))
 if [[ "$DP_ATTENTION" == "true" ]]; then
     KV_CACHE_FREE_MEM_FRACTION="${KV_CACHE_FREE_MEM_FRACTION:-0.6}"
 else
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 421a4bd47..0317da921 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3539,7 +3539,8 @@
     - "Update the B300 TensorRT-LLM DeepSeek-V4-Pro image to ghcr.io/semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-c185066"
     - "B300 analog of PR #1699 (B200): sync the dsv4-fp4-b300-trt and dsv4-fp4-b300-trt-mtp recipes with the agg frontier config (worker GC off, NCCL graph mixing off, mimalloc/PyTorch alloc tweaks, higher KV cache fractions by DP path, stream_interval 100, use_low_precision_moe_combine, DP batching_wait_iters 30, max_num_tokens drops the OSL term)"
     - "MTP recipe uses max_draft_len with a variable default draft length, enable_lm_head_tp_in_adp on the DP-attn path, and removes timeout_iters from the DP config"
-    - "B300-specific bits preserved (MODEL_PATH download block, TRTLLM_MHC_ENABLE_FUSED_HC=1, trtllm-serve MODEL_PATH); 1k1k search space left as-is, 8k1k dsv4-fp4-b300-trt conc-end trimmed from 1024 to 256 on the tp8/ep8 DP-attn row"
+    - "Cap cuda_graph_config.max_batch_size at 1024 on both recipes: TRTLLM_MLA_EXTRA_OVERLAP hands MLA prologue tensors across streams without record_stream(), so CUDA-graph warmup at decode batch >1024 (repros at 1088, e.g. tp8/ep8 dp-attn conc-2048 on B300) use-after-frees into CUDA_ERROR_ILLEGAL_ADDRESS; workaround until NVIDIA/TensorRT-LLM#15265 ships in the image. Runtime --max_batch_size stays = CONC, so batches >1024 run eager"
+    - "B300-specific bits preserved (MODEL_PATH download block, TRTLLM_MHC_ENABLE_FUSED_HC=1, trtllm-serve MODEL_PATH); search space left as-is since the B300 sweeps already cover the high-concurrency regime"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1703
 
 - config-keys:

From 9f02c5dce9091fd815615e503e081fdf20b2bfc3 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Thu, 11 Jun 2026 11:54:32 -0700
Subject: [PATCH 7/7] Drop 1k1k conc-2048 point from B300 DSv4 TRT sweeps

Remove the conc=2048 point on the 1k1k tp8/ep8 DP-attn row for both
dsv4-fp4-b300-trt and dsv4-fp4-b300-trt-mtp (now 512-1024). This is the
batch regime that triggers the MLA-overlap warmup crash (NVIDIA/TensorRT-LLM#15265);
the cudagraph cap at 1024 stays as a safety net. 8k1k unchanged.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .github/configs/nvidia-master.yaml | 4 ++--
 perf-changelog.yaml                | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index e7bc8b348..25777616d 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -3063,7 +3063,7 @@ dsv4-fp4-b300-trt:
       search-space:
       - { tp: 4, conc-start: 1, conc-end: 64 }
       - { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 256 }
-      - { tp: 8, ep: 8, dp-attn: true, conc-start: 512, conc-end: 2048 }
+      - { tp: 8, ep: 8, dp-attn: true, conc-start: 512, conc-end: 1024 }
     - isl: 8192
       osl: 1024
       search-space:
@@ -3086,7 +3086,7 @@ dsv4-fp4-b300-trt-mtp:
       search-space:
       - { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp }
       - { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 256, spec-decoding: mtp }
-      - { tp: 8, ep: 8, dp-attn: true, conc-start: 512, conc-end: 2048, spec-decoding: mtp }
+      - { tp: 8, ep: 8, dp-attn: true, conc-start: 512, conc-end: 1024, spec-decoding: mtp }
     - isl: 8192
       osl: 1024
       search-space:
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 90a305489..e54dc7b6e 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3551,7 +3551,7 @@
     - "B300 analog of PR #1699 (B200): sync the dsv4-fp4-b300-trt and dsv4-fp4-b300-trt-mtp recipes with the agg frontier config (worker GC off, NCCL graph mixing off, mimalloc/PyTorch alloc tweaks, higher KV cache fractions by DP path, stream_interval 100, use_low_precision_moe_combine, DP batching_wait_iters 30, max_num_tokens drops the OSL term)"
     - "MTP recipe uses max_draft_len with a variable default draft length, enable_lm_head_tp_in_adp on the DP-attn path, and removes timeout_iters from the DP config"
     - "Cap cuda_graph_config.max_batch_size at 1024 on both recipes: TRTLLM_MLA_EXTRA_OVERLAP hands MLA prologue tensors across streams without record_stream(), so CUDA-graph warmup at decode batch >1024 (repros at 1088, e.g. tp8/ep8 dp-attn conc-2048 on B300) use-after-frees into CUDA_ERROR_ILLEGAL_ADDRESS; workaround until NVIDIA/TensorRT-LLM#15265 ships in the image. Runtime --max_batch_size stays = CONC, so batches >1024 run eager"
-    - "B300-specific bits preserved (MODEL_PATH download block, TRTLLM_MHC_ENABLE_FUSED_HC=1, trtllm-serve MODEL_PATH); search space left as-is since the B300 sweeps already cover the high-concurrency regime"
+    - "B300-specific bits preserved (MODEL_PATH download block, TRTLLM_MHC_ENABLE_FUSED_HC=1, trtllm-serve MODEL_PATH); drop the 1k1k conc-2048 point on the tp8/ep8 DP-attn row (both recipes), the batch regime that triggers the MLA-overlap crash above; rest of the search space unchanged"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1703
 
 - config-keys: