SemiAnalysisAI · yhyang201 · Jun 10, 2026 · Jun 10, 2026 · Jun 10, 2026 · Jun 10, 2026
@@ -2043,7 +2043,7 @@ dsv4-fp4-b300-sglang:
   #   dp-attn: true  -> DP-attn  + flashinfer_mxfp4 + chunked-prefill 32768
-  #                     + EAGLE (1,1,2) + mem-fraction 0.92 + max-running 256
+  #                     + EAGLE (3,1,4) + mem-fraction 0.92 + max-running 256
 dsv4-fp4-b300-sglang-mtp:
-  image: lmsysorg/sglang:deepseek-v4-b300@sha256:26e116bd211e300dbb76924d56c5cbe6cc3ee5ee2fe314859cb8774f5bc070f3
+  image: lmsysorg/sglang:nightly-dev-cu13-20260610-f332e526
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: b300

diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang_mtp.sh
@@ -41,13 +41,12 @@ fi
 
 nvidia-smi
 
-# Common SGLANG env vars (apply to every config).
-export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0
+# Common SGLANG env vars.
+export SGLANG_JIT_DEEPGEMM_FAST_WARMUP=1
+export SGLANG_RADIX_FORCE_MISS=1
+export SGLANG_DEFAULT_THINKING=1
+export SGLANG_DSV4_REASONING_EFFORT=max
 export SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT=1
-export SGLANG_OPT_USE_JIT_NORM=1
-export SGLANG_OPT_USE_JIT_INDEXER_METADATA=1
-export SGLANG_OPT_USE_TOPK_V2=1
-export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1
 
 # TODO(Cam): the deepseek-v4 sglang images install sglang editable at
 # /workspace/sglang/python; prior sglang tags used /sgl-workspace/sglang.
@@ -75,18 +74,16 @@ DEEPEP_CONFIG='{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}
 if [ "${DP_ATTENTION}" = "true" ]; then
     # DP-attn path: flashinfer_mxfp4 + DP-attn (covers conc 16-256).
     export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1
-    export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=0
-    export SGLANG_OPT_FIX_HASH_MEGA_MOE=0
-    export SGLANG_OPT_USE_FAST_MASK_EP=1
-    export SGLANG_OPT_FIX_MEGA_MOE_MEMORY=1
-    export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=4096
-    export SGLANG_OPT_FIX_NEXTN_MEGA_MOE=1
-    export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=0
+    export SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW=1
+    export SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_FP4_ACTS=1
+    export SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_MXF4_KIND=1
+    export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8192
+    export SGLANG_REQUEST_STATE_WAIT_TIMEOUT=60
     SPEC_FLAGS=(
         --speculative-algorithm EAGLE
-        --speculative-num-steps 1
+        --speculative-num-steps 3
         --speculative-eagle-topk 1
-        --speculative-num-draft-tokens 2
+        --speculative-num-draft-tokens 4
     )
     PARALLEL_ARGS=(
         --dp-size "$TP"
@@ -95,6 +92,10 @@ if [ "${DP_ATTENTION}" = "true" ]; then
         --disable-flashinfer-autotune
         --deepep-config "$DEEPEP_CONFIG"
         --cuda-graph-max-bs 256
+        --enable-deepseek-v4-fp4-indexer
+        --enforce-piecewise-cuda-graph
+        --enable-mixed-chunk
+        --enable-breakable-cuda-graph
     )
     CHUNKED_PREFILL_SIZE=32768
     MEM_FRACTION_STATIC=0.92
@@ -110,6 +111,10 @@ else
     PARALLEL_ARGS=(
         --moe-runner-backend flashinfer_mxfp4
         --disable-flashinfer-autotune
+        --enable-deepseek-v4-fp4-indexer
+        --enforce-piecewise-cuda-graph
+        --enable-mixed-chunk
+        --enable-breakable-cuda-graph
     )
     CHUNKED_PREFILL_SIZE=8192
     MEM_FRACTION_STATIC=0.90

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
@@ -3531,3 +3531,13 @@
     - "The Rust frontend replaces only the Python serving/API layer (HTTP, tokenization, scheduling glue, detokenization) and spawns the same Python EngineCore, so GPU kernels/attention/MoE GEMM/KV cache are untouched"
     - "A/B sweep (28 single-node points, 1k1k + 8k1k, TP 1/2/4) vs the Python-frontend baseline (run 26696260751): throughput Pareto-neutral (peak tok/s/GPU within <1.5%, frontiers coincident) and TPOT flat (+-0.5%); TTFT improves ~8% at 1k1k and ~22% at 8k1k (every point), the expected signature of lower frontend CPU latency before first token, scaling with input length"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1634
+
+- config-keys:
+    - dsv4-fp4-b300-sglang-mtp
+  description:
+    - "Switch image from lmsysorg/sglang:deepseek-v4-b300 (sha256:26e116bd...) to lmsysorg/sglang:nightly-dev-cu13-20260610-f332e526"
+    - "Add --enable-deepseek-v4-fp4-indexer, --enforce-piecewise-cuda-graph, --enable-mixed-chunk, --enable-breakable-cuda-graph to both DP-attn and TP-only launch profiles"
+    - "DP-attn: EAGLE speculative decoding raised from num-steps 1 / num-draft-tokens 2 to num-steps 3 / num-draft-tokens 4 (eagle-topk stays 1)"
+    - "Common env: drop SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 and SGLANG_OPT_USE_JIT_NORM / USE_JIT_INDEXER_METADATA / USE_TOPK_V2 / USE_CUSTOM_ALL_REDUCE_V2; add SGLANG_JIT_DEEPGEMM_FAST_WARMUP=1, SGLANG_RADIX_FORCE_MISS=1, SGLANG_DEFAULT_THINKING=1, SGLANG_DSV4_REASONING_EFFORT=max"
+    - "DP-attn env: drop the previous mega-MoE / fast-mask-EP / DeepEP dispatch-token overrides; add SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW=1, SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_FP4_ACTS=1, SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_MXF4_KIND=1, SGLANG_REQUEST_STATE_WAIT_TIMEOUT=60; raise SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK from 4096 to 8192"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1702