From d69402d0d13925cc5f153fd16d2cbd034abada2e Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Wed, 10 Jun 2026 17:24:41 +0800 Subject: [PATCH 1/6] dsv4-fp4-b300-sglang-mtp: align env vars to GB300, bump image --- .github/configs/nvidia-master.yaml | 2 +- .../fixed_seq_len/dsv4_fp4_b300_sglang_mtp.sh | 29 +++++++++---------- 2 files changed, 15 insertions(+), 16 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index a02749d4d..3e50f4940 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -2043,7 +2043,7 @@ dsv4-fp4-b300-sglang: # dp-attn: true -> DP-attn + flashinfer_mxfp4 + chunked-prefill 32768 # + EAGLE (1,1,2) + mem-fraction 0.92 + max-running 256 dsv4-fp4-b300-sglang-mtp: - image: lmsysorg/sglang:deepseek-v4-b300@sha256:26e116bd211e300dbb76924d56c5cbe6cc3ee5ee2fe314859cb8774f5bc070f3 + image: lmsysorg/sglang:nightly-dev-cu13-20260610-f332e526 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b300 diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang_mtp.sh index 672d48f4b..9c8bab961 100755 --- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang_mtp.sh @@ -41,13 +41,12 @@ fi nvidia-smi -# Common SGLANG env vars (apply to every config). -export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 +# Common SGLANG env vars. +export SGLANG_JIT_DEEPGEMM_FAST_WARMUP=1 +export SGLANG_RADIX_FORCE_MISS=1 +export SGLANG_DEFAULT_THINKING=1 +export SGLANG_DSV4_REASONING_EFFORT=max export SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT=1 -export SGLANG_OPT_USE_JIT_NORM=1 -export SGLANG_OPT_USE_JIT_INDEXER_METADATA=1 -export SGLANG_OPT_USE_TOPK_V2=1 -export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1 # TODO(Cam): the deepseek-v4 sglang images install sglang editable at # /workspace/sglang/python; prior sglang tags used /sgl-workspace/sglang. @@ -75,18 +74,16 @@ DEEPEP_CONFIG='{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96} if [ "${DP_ATTENTION}" = "true" ]; then # DP-attn path: flashinfer_mxfp4 + DP-attn (covers conc 16-256). export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1 - export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=0 - export SGLANG_OPT_FIX_HASH_MEGA_MOE=0 - export SGLANG_OPT_USE_FAST_MASK_EP=1 - export SGLANG_OPT_FIX_MEGA_MOE_MEMORY=1 - export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=4096 - export SGLANG_OPT_FIX_NEXTN_MEGA_MOE=1 - export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=0 + export SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW=1 + export SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_FP4_ACTS=1 + export SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_MXF4_KIND=1 + export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8192 + export SGLANG_REQUEST_STATE_WAIT_TIMEOUT=60 SPEC_FLAGS=( --speculative-algorithm EAGLE - --speculative-num-steps 1 + --speculative-num-steps 3 --speculative-eagle-topk 1 - --speculative-num-draft-tokens 2 + --speculative-num-draft-tokens 4 ) PARALLEL_ARGS=( --dp-size "$TP" @@ -95,6 +92,7 @@ if [ "${DP_ATTENTION}" = "true" ]; then --disable-flashinfer-autotune --deepep-config "$DEEPEP_CONFIG" --cuda-graph-max-bs 256 + --enable-deepseek-v4-fp4-indexer ) CHUNKED_PREFILL_SIZE=32768 MEM_FRACTION_STATIC=0.92 @@ -110,6 +108,7 @@ else PARALLEL_ARGS=( --moe-runner-backend flashinfer_mxfp4 --disable-flashinfer-autotune + --enable-deepseek-v4-fp4-indexer ) CHUNKED_PREFILL_SIZE=8192 MEM_FRACTION_STATIC=0.90 From c4db16797ec4eccd9934a6f97f7d9b9b5c60d985 Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Wed, 10 Jun 2026 17:25:26 +0800 Subject: [PATCH 2/6] perf-changelog: add entry for MTP GB300 alignment PR #1700 --- perf-changelog.yaml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 5622173f1..d9575935f 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3531,3 +3531,12 @@ - "The Rust frontend replaces only the Python serving/API layer (HTTP, tokenization, scheduling glue, detokenization) and spawns the same Python EngineCore, so GPU kernels/attention/MoE GEMM/KV cache are untouched" - "A/B sweep (28 single-node points, 1k1k + 8k1k, TP 1/2/4) vs the Python-frontend baseline (run 26696260751): throughput Pareto-neutral (peak tok/s/GPU within <1.5%, frontiers coincident) and TPOT flat (+-0.5%); TTFT improves ~8% at 1k1k and ~22% at 8k1k (every point), the expected signature of lower frontend CPU latency before first token, scaling with input length" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1634 + +- config-keys: + - dsv4-fp4-b300-sglang-mtp + description: + - "Align MTP env vars to GB300: replace PRECOMPILE=0 with FAST_WARMUP=1, add RADIX_FORCE_MISS, DEFAULT_THINKING, DSV4_REASONING_EFFORT=max" + - "Replace DP-attn env vars with shared GB300 block: MEGA_MOE_USE_FP4_ACTS, USE_MXF4_KIND, NUM_MAX_TOKENS_PER_RANK=8192" + - "Unify EAGLE spec-decoding to (3,1,4) for both DP-attn and TP-only paths, add --enable-deepseek-v4-fp4-indexer" + - "Bump image to nightly-dev-cu13-20260610-f332e526" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1700 From 3204bd541b9f2a5cc96001311afe2e367703bcd4 Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Wed, 10 Jun 2026 23:06:15 +0800 Subject: [PATCH 3/6] dsv4-fp4-b300-sglang-mtp: add piecewise cuda graph flags --- .../single_node/fixed_seq_len/dsv4_fp4_b300_sglang_mtp.sh | 6 ++++++ perf-changelog.yaml | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang_mtp.sh index 9c8bab961..328b8f970 100755 --- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang_mtp.sh @@ -93,6 +93,9 @@ if [ "${DP_ATTENTION}" = "true" ]; then --deepep-config "$DEEPEP_CONFIG" --cuda-graph-max-bs 256 --enable-deepseek-v4-fp4-indexer + --enforce-piecewise-cuda-graph + --enable-mixed-chunk + --enable-breakable-cuda-graph ) CHUNKED_PREFILL_SIZE=32768 MEM_FRACTION_STATIC=0.92 @@ -109,6 +112,9 @@ else --moe-runner-backend flashinfer_mxfp4 --disable-flashinfer-autotune --enable-deepseek-v4-fp4-indexer + --enforce-piecewise-cuda-graph + --enable-mixed-chunk + --enable-breakable-cuda-graph ) CHUNKED_PREFILL_SIZE=8192 MEM_FRACTION_STATIC=0.90 diff --git a/perf-changelog.yaml b/perf-changelog.yaml index d9575935f..c058f7eb7 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3540,3 +3540,9 @@ - "Unify EAGLE spec-decoding to (3,1,4) for both DP-attn and TP-only paths, add --enable-deepseek-v4-fp4-indexer" - "Bump image to nightly-dev-cu13-20260610-f332e526" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1700 + +- config-keys: + - dsv4-fp4-b300-sglang-mtp + description: + - "Add --enforce-piecewise-cuda-graph, --enable-mixed-chunk, --enable-breakable-cuda-graph to both DP-attn and TP-only launch profiles" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1702 From 81b75bff4cacf8fb68cf75aa8447759b30a2db11 Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Wed, 10 Jun 2026 23:25:18 +0800 Subject: [PATCH 4/6] perf-changelog: keep only #1702 entry, drop #1700 (belongs in its own PR) --- perf-changelog.yaml | 9 --------- 1 file changed, 9 deletions(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index c058f7eb7..6329bb9a5 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3532,15 +3532,6 @@ - "A/B sweep (28 single-node points, 1k1k + 8k1k, TP 1/2/4) vs the Python-frontend baseline (run 26696260751): throughput Pareto-neutral (peak tok/s/GPU within <1.5%, frontiers coincident) and TPOT flat (+-0.5%); TTFT improves ~8% at 1k1k and ~22% at 8k1k (every point), the expected signature of lower frontend CPU latency before first token, scaling with input length" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1634 -- config-keys: - - dsv4-fp4-b300-sglang-mtp - description: - - "Align MTP env vars to GB300: replace PRECOMPILE=0 with FAST_WARMUP=1, add RADIX_FORCE_MISS, DEFAULT_THINKING, DSV4_REASONING_EFFORT=max" - - "Replace DP-attn env vars with shared GB300 block: MEGA_MOE_USE_FP4_ACTS, USE_MXF4_KIND, NUM_MAX_TOKENS_PER_RANK=8192" - - "Unify EAGLE spec-decoding to (3,1,4) for both DP-attn and TP-only paths, add --enable-deepseek-v4-fp4-indexer" - - "Bump image to nightly-dev-cu13-20260610-f332e526" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1700 - - config-keys: - dsv4-fp4-b300-sglang-mtp description: From 4085b1ffe8f56a51ad1c28847a6851ee6b5115eb Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Wed, 10 Jun 2026 23:42:27 +0800 Subject: [PATCH 5/6] trigger CI re-run From 753ce6cc7a5b15ed5697fcde8f7dad9afd3baf4c Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Wed, 10 Jun 2026 23:59:02 +0800 Subject: [PATCH 6/6] re-trigger CI