Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2043,7 +2043,7 @@ dsv4-fp4-b300-sglang:
# dp-attn: true -> DP-attn + flashinfer_mxfp4 + chunked-prefill 32768
# + EAGLE (3,1,4) + mem-fraction 0.92 + max-running 256
dsv4-fp4-b300-sglang-mtp:
image: lmsysorg/sglang:deepseek-v4-b300@sha256:26e116bd211e300dbb76924d56c5cbe6cc3ee5ee2fe314859cb8774f5bc070f3
image: lmsysorg/sglang:nightly-dev-cu13-20260610-f332e526
model: deepseek-ai/DeepSeek-V4-Pro
model-prefix: dsv4
runner: b300
Expand Down
35 changes: 20 additions & 15 deletions benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang_mtp.sh
Original file line number Diff line number Diff line change
Expand Up @@ -41,13 +41,12 @@ fi

nvidia-smi

# Common SGLANG env vars (apply to every config).
export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0
# Common SGLANG env vars.
export SGLANG_JIT_DEEPGEMM_FAST_WARMUP=1
export SGLANG_RADIX_FORCE_MISS=1
export SGLANG_DEFAULT_THINKING=1
export SGLANG_DSV4_REASONING_EFFORT=max
export SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT=1
export SGLANG_OPT_USE_JIT_NORM=1
export SGLANG_OPT_USE_JIT_INDEXER_METADATA=1
export SGLANG_OPT_USE_TOPK_V2=1
export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1

# TODO(Cam): the deepseek-v4 sglang images install sglang editable at
# /workspace/sglang/python; prior sglang tags used /sgl-workspace/sglang.
Expand Down Expand Up @@ -75,18 +74,16 @@ DEEPEP_CONFIG='{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}
if [ "${DP_ATTENTION}" = "true" ]; then
# DP-attn path: flashinfer_mxfp4 + DP-attn (covers conc 16-256).
export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1
export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=0
export SGLANG_OPT_FIX_HASH_MEGA_MOE=0
export SGLANG_OPT_USE_FAST_MASK_EP=1
export SGLANG_OPT_FIX_MEGA_MOE_MEMORY=1
export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=4096
export SGLANG_OPT_FIX_NEXTN_MEGA_MOE=1
export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=0
export SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW=1
export SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_FP4_ACTS=1
export SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_MXF4_KIND=1
export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8192
export SGLANG_REQUEST_STATE_WAIT_TIMEOUT=60
SPEC_FLAGS=(
--speculative-algorithm EAGLE
--speculative-num-steps 1
--speculative-num-steps 3
--speculative-eagle-topk 1
--speculative-num-draft-tokens 2
--speculative-num-draft-tokens 4
)
PARALLEL_ARGS=(
--dp-size "$TP"
Expand All @@ -95,6 +92,10 @@ if [ "${DP_ATTENTION}" = "true" ]; then
--disable-flashinfer-autotune
--deepep-config "$DEEPEP_CONFIG"
--cuda-graph-max-bs 256
--enable-deepseek-v4-fp4-indexer
--enforce-piecewise-cuda-graph
--enable-mixed-chunk
--enable-breakable-cuda-graph
)
CHUNKED_PREFILL_SIZE=32768
MEM_FRACTION_STATIC=0.92
Expand All @@ -110,6 +111,10 @@ else
PARALLEL_ARGS=(
--moe-runner-backend flashinfer_mxfp4
--disable-flashinfer-autotune
--enable-deepseek-v4-fp4-indexer
--enforce-piecewise-cuda-graph
--enable-mixed-chunk
--enable-breakable-cuda-graph
)
CHUNKED_PREFILL_SIZE=8192
MEM_FRACTION_STATIC=0.90
Expand Down
6 changes: 6 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3531,3 +3531,9 @@
- "The Rust frontend replaces only the Python serving/API layer (HTTP, tokenization, scheduling glue, detokenization) and spawns the same Python EngineCore, so GPU kernels/attention/MoE GEMM/KV cache are untouched"
- "A/B sweep (28 single-node points, 1k1k + 8k1k, TP 1/2/4) vs the Python-frontend baseline (run 26696260751): throughput Pareto-neutral (peak tok/s/GPU within <1.5%, frontiers coincident) and TPOT flat (+-0.5%); TTFT improves ~8% at 1k1k and ~22% at 8k1k (every point), the expected signature of lower frontend CPU latency before first token, scaling with input length"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1634

- config-keys:
- dsv4-fp4-b300-sglang-mtp
description:
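- "Add --enforce-piecewise-cuda-graph, --enable-mixed-chunk, --enable-breakable-cuda-graph (plus --enable-deepseek-v4-fp4-indexer) to both DP-attn and TP-only launch profiles; bump image to lmsysorg/sglang:nightly-dev-cu13-20260610-f332e526; raise DP-attn EAGLE speculation to num-steps 3 / num-draft-tokens 4"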
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1702
Loading