diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index a02749d4d..3e50f4940 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -2043,7 +2043,7 @@ dsv4-fp4-b300-sglang: # dp-attn: true -> DP-attn + flashinfer_mxfp4 + chunked-prefill 32768 # + EAGLE (1,1,2) + mem-fraction 0.92 + max-running 256 dsv4-fp4-b300-sglang-mtp: - image: lmsysorg/sglang:deepseek-v4-b300@sha256:26e116bd211e300dbb76924d56c5cbe6cc3ee5ee2fe314859cb8774f5bc070f3 + image: lmsysorg/sglang:nightly-dev-cu13-20260610-f332e526 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b300 diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang_mtp.sh index 672d48f4b..328b8f970 100755 --- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang_mtp.sh @@ -41,13 +41,12 @@ fi nvidia-smi -# Common SGLANG env vars (apply to every config). -export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 +# Common SGLANG env vars. +export SGLANG_JIT_DEEPGEMM_FAST_WARMUP=1 +export SGLANG_RADIX_FORCE_MISS=1 +export SGLANG_DEFAULT_THINKING=1 +export SGLANG_DSV4_REASONING_EFFORT=max export SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT=1 -export SGLANG_OPT_USE_JIT_NORM=1 -export SGLANG_OPT_USE_JIT_INDEXER_METADATA=1 -export SGLANG_OPT_USE_TOPK_V2=1 -export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1 # TODO(Cam): the deepseek-v4 sglang images install sglang editable at # /workspace/sglang/python; prior sglang tags used /sgl-workspace/sglang. @@ -75,18 +74,16 @@ DEEPEP_CONFIG='{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96} if [ "${DP_ATTENTION}" = "true" ]; then # DP-attn path: flashinfer_mxfp4 + DP-attn (covers conc 16-256). 
export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1 - export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=0 - export SGLANG_OPT_FIX_HASH_MEGA_MOE=0 - export SGLANG_OPT_USE_FAST_MASK_EP=1 - export SGLANG_OPT_FIX_MEGA_MOE_MEMORY=1 - export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=4096 - export SGLANG_OPT_FIX_NEXTN_MEGA_MOE=1 - export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=0 + export SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW=1 + export SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_FP4_ACTS=1 + export SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_MXF4_KIND=1 + export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8192 + export SGLANG_REQUEST_STATE_WAIT_TIMEOUT=60 SPEC_FLAGS=( --speculative-algorithm EAGLE - --speculative-num-steps 1 + --speculative-num-steps 3 --speculative-eagle-topk 1 - --speculative-num-draft-tokens 2 + --speculative-num-draft-tokens 4 ) PARALLEL_ARGS=( --dp-size "$TP" @@ -95,6 +92,10 @@ if [ "${DP_ATTENTION}" = "true" ]; then --disable-flashinfer-autotune --deepep-config "$DEEPEP_CONFIG" --cuda-graph-max-bs 256 + --enable-deepseek-v4-fp4-indexer + --enforce-piecewise-cuda-graph + --enable-mixed-chunk + --enable-breakable-cuda-graph ) CHUNKED_PREFILL_SIZE=32768 MEM_FRACTION_STATIC=0.92 @@ -110,6 +111,10 @@ else PARALLEL_ARGS=( --moe-runner-backend flashinfer_mxfp4 --disable-flashinfer-autotune + --enable-deepseek-v4-fp4-indexer + --enforce-piecewise-cuda-graph + --enable-mixed-chunk + --enable-breakable-cuda-graph ) CHUNKED_PREFILL_SIZE=8192 MEM_FRACTION_STATIC=0.90 diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 5622173f1..6329bb9a5 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3531,3 +3531,9 @@ - "The Rust frontend replaces only the Python serving/API layer (HTTP, tokenization, scheduling glue, detokenization) and spawns the same Python EngineCore, so GPU kernels/attention/MoE GEMM/KV cache are untouched" - "A/B sweep (28 single-node points, 1k1k + 8k1k, TP 1/2/4) vs the Python-frontend baseline (run 26696260751): throughput 
Pareto-neutral (peak tok/s/GPU within <1.5%, frontiers coincident) and TPOT flat (+-0.5%); TTFT improves ~8% at 1k1k and ~22% at 8k1k (every point), the expected signature of lower frontend CPU latency before first token, scaling with input length" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1634 + +- config-keys: + - dsv4-fp4-b300-sglang-mtp + description: + - "Add --enable-deepseek-v4-fp4-indexer, --enforce-piecewise-cuda-graph, --enable-mixed-chunk, --enable-breakable-cuda-graph to both DP-attn and TP-only launch profiles; bump image to lmsysorg/sglang:nightly-dev-cu13-20260610-f332e526; raise DP-attn EAGLE speculative-num-steps 1 -> 3 and speculative-num-draft-tokens 2 -> 4" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1702