From d69402d0d13925cc5f153fd16d2cbd034abada2e Mon Sep 17 00:00:00 2001
From: yhyang201 <yhyang201@gmail.com>
Date: Wed, 10 Jun 2026 17:24:41 +0800
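Subject: [PATCH 1/6] dsv4-fp4-b300-sglang-mtp: align env vars to GB300, bump
 image

Besides the env-var alignment and the image bump, this patch also:
- changes the DP-attn EAGLE settings from 1 step / 2 draft tokens to
  3 steps / 4 draft tokens (topk stays 1);
- adds --enable-deepseek-v4-fp4-indexer to PARALLEL_ARGS on both the
  DP-attn branch and the else branch.

Note: the image reference moves from a sha256-pinned tag to a plain
nightly tag, so it is no longer pinned by digest.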
---
 .github/configs/nvidia-master.yaml            |  4 ++--
 .../fixed_seq_len/dsv4_fp4_b300_sglang_mtp.sh | 29 +++++++++----------
 2 files changed, 16 insertions(+), 17 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index a02749d4d..3e50f4940 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -2043,7 +2043,7 @@ dsv4-fp4-b300-sglang:
   #   dp-attn: true  -> DP-attn  + flashinfer_mxfp4 + chunked-prefill 32768
-  #                     + EAGLE (1,1,2) + mem-fraction 0.92 + max-running 256
+  #                     + EAGLE (3,1,4) + mem-fraction 0.92 + max-running 256
 dsv4-fp4-b300-sglang-mtp:
-  image: lmsysorg/sglang:deepseek-v4-b300@sha256:26e116bd211e300dbb76924d56c5cbe6cc3ee5ee2fe314859cb8774f5bc070f3
+  image: lmsysorg/sglang:nightly-dev-cu13-20260610-f332e526
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: b300
diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang_mtp.sh
index 672d48f4b..9c8bab961 100755
--- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang_mtp.sh
@@ -41,13 +41,12 @@ fi
 
 nvidia-smi
 
-# Common SGLANG env vars (apply to every config).
-export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0
+# Common SGLANG env vars.
+export SGLANG_JIT_DEEPGEMM_FAST_WARMUP=1
+export SGLANG_RADIX_FORCE_MISS=1
+export SGLANG_DEFAULT_THINKING=1
+export SGLANG_DSV4_REASONING_EFFORT=max
 export SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT=1
-export SGLANG_OPT_USE_JIT_NORM=1
-export SGLANG_OPT_USE_JIT_INDEXER_METADATA=1
-export SGLANG_OPT_USE_TOPK_V2=1
-export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1
 
 # TODO(Cam): the deepseek-v4 sglang images install sglang editable at
 # /workspace/sglang/python; prior sglang tags used /sgl-workspace/sglang.
@@ -75,18 +74,16 @@ DEEPEP_CONFIG='{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}
 if [ "${DP_ATTENTION}" = "true" ]; then
     # DP-attn path: flashinfer_mxfp4 + DP-attn (covers conc 16-256).
     export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1
-    export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=0
-    export SGLANG_OPT_FIX_HASH_MEGA_MOE=0
-    export SGLANG_OPT_USE_FAST_MASK_EP=1
-    export SGLANG_OPT_FIX_MEGA_MOE_MEMORY=1
-    export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=4096
-    export SGLANG_OPT_FIX_NEXTN_MEGA_MOE=1
-    export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=0
+    export SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW=1
+    export SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_FP4_ACTS=1
+    export SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_MXF4_KIND=1
+    export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8192
+    export SGLANG_REQUEST_STATE_WAIT_TIMEOUT=60
     SPEC_FLAGS=(
         --speculative-algorithm EAGLE
-        --speculative-num-steps 1
+        --speculative-num-steps 3
         --speculative-eagle-topk 1
-        --speculative-num-draft-tokens 2
+        --speculative-num-draft-tokens 4
     )
     PARALLEL_ARGS=(
         --dp-size "$TP"
@@ -95,6 +92,7 @@ if [ "${DP_ATTENTION}" = "true" ]; then
         --disable-flashinfer-autotune
         --deepep-config "$DEEPEP_CONFIG"
         --cuda-graph-max-bs 256
+        --enable-deepseek-v4-fp4-indexer
     )
     CHUNKED_PREFILL_SIZE=32768
     MEM_FRACTION_STATIC=0.92
@@ -110,6 +108,7 @@ else
     PARALLEL_ARGS=(
         --moe-runner-backend flashinfer_mxfp4
         --disable-flashinfer-autotune
+        --enable-deepseek-v4-fp4-indexer
     )
     CHUNKED_PREFILL_SIZE=8192
     MEM_FRACTION_STATIC=0.90

From c4db16797ec4eccd9934a6f97f7d9b9b5c60d985 Mon Sep 17 00:00:00 2001
From: yhyang201 <yhyang201@gmail.com>
Date: Wed, 10 Jun 2026 17:25:26 +0800
Subject: [PATCH 2/6] perf-changelog: add entry for MTP GB300 alignment PR
 #1700

---
 perf-changelog.yaml | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 5622173f1..d9575935f 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3531,3 +3531,12 @@
     - "The Rust frontend replaces only the Python serving/API layer (HTTP, tokenization, scheduling glue, detokenization) and spawns the same Python EngineCore, so GPU kernels/attention/MoE GEMM/KV cache are untouched"
     - "A/B sweep (28 single-node points, 1k1k + 8k1k, TP 1/2/4) vs the Python-frontend baseline (run 26696260751): throughput Pareto-neutral (peak tok/s/GPU within <1.5%, frontiers coincident) and TPOT flat (+-0.5%); TTFT improves ~8% at 1k1k and ~22% at 8k1k (every point), the expected signature of lower frontend CPU latency before first token, scaling with input length"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1634
+
+- config-keys:
+    - dsv4-fp4-b300-sglang-mtp
+  description:
+    - "Align MTP env vars to GB300: replace PRECOMPILE=0 with FAST_WARMUP=1, add RADIX_FORCE_MISS, DEFAULT_THINKING, DSV4_REASONING_EFFORT=max"
+    - "Replace DP-attn env vars with shared GB300 block: MEGA_MOE_USE_FP4_ACTS, USE_MXF4_KIND, NUM_MAX_TOKENS_PER_RANK=8192"
+    - "Unify EAGLE spec-decoding to (3,1,4) for both DP-attn and TP-only paths, add --enable-deepseek-v4-fp4-indexer"
+    - "Bump image to nightly-dev-cu13-20260610-f332e526"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1700

From 3204bd541b9f2a5cc96001311afe2e367703bcd4 Mon Sep 17 00:00:00 2001
From: yhyang201 <yhyang201@gmail.com>
Date: Wed, 10 Jun 2026 23:06:15 +0800
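Subject: [PATCH 3/6] dsv4-fp4-b300-sglang-mtp: add piecewise cuda graph flags

Add --enforce-piecewise-cuda-graph, --enable-mixed-chunk and
--enable-breakable-cuda-graph to PARALLEL_ARGS on both the DP-attn
branch and the else branch, and record the change in
perf-changelog.yaml.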
---
 .../single_node/fixed_seq_len/dsv4_fp4_b300_sglang_mtp.sh   | 6 ++++++
 perf-changelog.yaml                                         | 6 ++++++
 2 files changed, 12 insertions(+)

diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang_mtp.sh
index 9c8bab961..328b8f970 100755
--- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang_mtp.sh
@@ -93,6 +93,9 @@ if [ "${DP_ATTENTION}" = "true" ]; then
         --deepep-config "$DEEPEP_CONFIG"
         --cuda-graph-max-bs 256
         --enable-deepseek-v4-fp4-indexer
+        --enforce-piecewise-cuda-graph
+        --enable-mixed-chunk
+        --enable-breakable-cuda-graph
     )
     CHUNKED_PREFILL_SIZE=32768
     MEM_FRACTION_STATIC=0.92
@@ -109,6 +112,9 @@ else
         --moe-runner-backend flashinfer_mxfp4
         --disable-flashinfer-autotune
         --enable-deepseek-v4-fp4-indexer
+        --enforce-piecewise-cuda-graph
+        --enable-mixed-chunk
+        --enable-breakable-cuda-graph
     )
     CHUNKED_PREFILL_SIZE=8192
     MEM_FRACTION_STATIC=0.90
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index d9575935f..c058f7eb7 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3540,3 +3540,9 @@
     - "Unify EAGLE spec-decoding to (3,1,4) for both DP-attn and TP-only paths, add --enable-deepseek-v4-fp4-indexer"
     - "Bump image to nightly-dev-cu13-20260610-f332e526"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1700
+
+- config-keys:
+    - dsv4-fp4-b300-sglang-mtp
+  description:
+    - "Add --enforce-piecewise-cuda-graph, --enable-mixed-chunk, --enable-breakable-cuda-graph to both DP-attn and TP-only launch profiles"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1702

From 81b75bff4cacf8fb68cf75aa8447759b30a2db11 Mon Sep 17 00:00:00 2001
From: yhyang201 <yhyang201@gmail.com>
Date: Wed, 10 Jun 2026 23:25:18 +0800
Subject: [PATCH 4/6] perf-changelog: keep only #1702 entry, drop #1700
 (belongs in its own PR)

---
 perf-changelog.yaml | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index c058f7eb7..6329bb9a5 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3532,15 +3532,6 @@
     - "A/B sweep (28 single-node points, 1k1k + 8k1k, TP 1/2/4) vs the Python-frontend baseline (run 26696260751): throughput Pareto-neutral (peak tok/s/GPU within <1.5%, frontiers coincident) and TPOT flat (+-0.5%); TTFT improves ~8% at 1k1k and ~22% at 8k1k (every point), the expected signature of lower frontend CPU latency before first token, scaling with input length"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1634
 
-- config-keys:
-    - dsv4-fp4-b300-sglang-mtp
-  description:
-    - "Align MTP env vars to GB300: replace PRECOMPILE=0 with FAST_WARMUP=1, add RADIX_FORCE_MISS, DEFAULT_THINKING, DSV4_REASONING_EFFORT=max"
-    - "Replace DP-attn env vars with shared GB300 block: MEGA_MOE_USE_FP4_ACTS, USE_MXF4_KIND, NUM_MAX_TOKENS_PER_RANK=8192"
-    - "Unify EAGLE spec-decoding to (3,1,4) for both DP-attn and TP-only paths, add --enable-deepseek-v4-fp4-indexer"
-    - "Bump image to nightly-dev-cu13-20260610-f332e526"
-  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1700
-
 - config-keys:
     - dsv4-fp4-b300-sglang-mtp
   description:

From 4085b1ffe8f56a51ad1c28847a6851ee6b5115eb Mon Sep 17 00:00:00 2001
From: yhyang201 <yhyang201@gmail.com>
Date: Wed, 10 Jun 2026 23:42:27 +0800
Subject: [PATCH 5/6] trigger CI re-run


From 753ce6cc7a5b15ed5697fcde8f7dad9afd3baf4c Mon Sep 17 00:00:00 2001
From: yhyang201 <yhyang201@gmail.com>
Date: Wed, 10 Jun 2026 23:59:02 +0800
Subject: [PATCH 6/6] re-trigger CI