Skip to content
137 changes: 137 additions & 0 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8785,6 +8785,143 @@ dsv4-fp4-gb200-dynamo-sglang:
ep: 12
dp-attn: true

# MTP variant of dsv4-fp4-gb200-dynamo-sglang.
# Every search-space point below sets spec-decoding: "mtp" and selects an
# "-mtp" recipe through CONFIG_FILE in the prefill additional-settings.
# Points are listed in order of increasing concurrency: two low-latency
# layouts first, then the mid-curve 1p1d..6p1d layouts.
# Shorthand used in the per-point comments, as it lines up with the values:
#   NpMd  - N prefill workers and M decode workers (the num-worker values)
#   tpK   - a worker with tp: K, ep: 1, dp-attn: false
#   depK  - a worker with tp: K, ep: K, dp-attn: true
# The node counts in the comments are consistent with 4 GPUs per node
# (tp: 8 -> 2 nodes per worker, tp: 16 -> 4 nodes per worker).
dsv4-fp4-gb200-dynamo-sglang-mtp:
image: lmsysorg/sglang:nightly-dev-cu13-20260528-0abe6a85
model: deepseek-ai/DeepSeek-V4-Pro
model-prefix: dsv4
runner: gb200
precision: fp4
framework: dynamo-sglang
multinode: true
disagg: true
scenarios:
fixed-seq-len:
# Single sequence-length scenario: 8192 input tokens, 1024 output tokens.
- isl: 8192
osl: 1024
search-space:
# Low-latency baseline: 1p1d-tp8-tp8. 4 nodes.
- spec-decoding: "mtp"
conc-list: [1]
prefill:
num-worker: 1
tp: 8
ep: 1
dp-attn: false
additional-settings:
- "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-low-latency-1p1d-tp8-tp8-mtp.yaml"
decode:
num-worker: 1
tp: 8
ep: 1
dp-attn: false
# Low-latency 1p6d-dep8-tp8: 1P (DEP=8) + 6 TP=8 decode workers. 14 nodes.
# Recipe runs concurrencies=32x64x128; matrix tracks the max.
# conc-list therefore carries only 128, the largest of those values.
- spec-decoding: "mtp"
conc-list: [128]
prefill:
num-worker: 1
tp: 8
ep: 8
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-low-latency-1p6d-dep8-tp8-mtp.yaml"
decode:
num-worker: 6
tp: 8
ep: 1
dp-attn: false
# Mid curve 1p1d-dep8-dep16. 6 nodes.
# All mid-curve points share the same decode layout (one dep16 worker);
# only the number of dep8 prefill workers and the concurrency change.
- spec-decoding: "mtp"
conc-list: [1024]
prefill:
num-worker: 1
tp: 8
ep: 8
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-mid-curve-1p1d-dep8-dep16-mtp.yaml"
decode:
num-worker: 1
tp: 16
ep: 16
dp-attn: true
# Mid curve 2p1d-dep8-dep16. 8 nodes.
- spec-decoding: "mtp"
conc-list: [2048]
prefill:
num-worker: 2
tp: 8
ep: 8
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-mid-curve-2p1d-dep8-dep16-mtp.yaml"
decode:
num-worker: 1
tp: 16
ep: 16
dp-attn: true
# Mid curve 3p1d-dep8-dep16. 10 nodes.
- spec-decoding: "mtp"
conc-list: [3072]
prefill:
num-worker: 3
tp: 8
ep: 8
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-mid-curve-3p1d-dep8-dep16-mtp.yaml"
decode:
num-worker: 1
tp: 16
ep: 16
dp-attn: true
# Mid curve 4p1d-dep8-dep16. 12 nodes.
- spec-decoding: "mtp"
conc-list: [6144]
prefill:
num-worker: 4
tp: 8
ep: 8
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-mid-curve-4p1d-dep8-dep16-mtp.yaml"
decode:
num-worker: 1
tp: 16
ep: 16
dp-attn: true
# Mid curve 5p1d-dep8-dep16. 14 nodes.
- spec-decoding: "mtp"
conc-list: [8192]
prefill:
num-worker: 5
tp: 8
ep: 8
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-mid-curve-5p1d-dep8-dep16-mtp.yaml"
decode:
num-worker: 1
tp: 16
ep: 16
dp-attn: true
# Mid curve 6p1d-dep8-dep16. 16 nodes.
- spec-decoding: "mtp"
conc-list: [16384]
prefill:
num-worker: 6
tp: 8
ep: 8
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-mid-curve-6p1d-dep8-dep16-mtp.yaml"
decode:
num-worker: 1
tp: 16
ep: 16
dp-attn: true

dsv4-fp4-b300-dynamo-vllm:
image: vllm/vllm-openai:v0.20.1
model: deepseek-ai/DeepSeek-V4-Pro
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
# Recipe identifier: DeepSeek-V4-Pro on GB200, disaggregated, 8k1k,
# low-latency layout with one TP8 prefill worker and one TP8 decode worker,
# MTP enabled.
name: "dsv4-pro-gb200-disagg-8k1k-low-latency-1p1d-tp8-tp8-mtp"

# Request frontend: Dynamo, with multiple frontends enabled and 8 additional
# frontend instances requested.
frontend:
type: dynamo
enable_multiple_frontends: true
num_additional_frontends: 8

# Dynamo pin.
# NOTE(review): hash looks like a git commit SHA of the Dynamo source that
# install: true builds or installs at job start — confirm against the launcher.
dynamo:
hash: "92f5b3b8d7dd5ab9179d4b1034bd2c1c0803693e"
install: true

# Model and container selection.
# NOTE(review): path is presumably a launcher-side alias for the model
# directory rather than a filesystem path — confirm.
model:
path: "deepseek-v4-pro"
container: "lmsysorg/sglang:nightly-dev-cu13-20260528-0abe6a85"
precision: "fp4"

# Extra Slurm sbatch directives; values are kept as quoted strings.
# NOTE(review): in Slurm, --mem=0 requests all memory on each node — confirm
# that is the intent here.
sbatch_directives:
cpus-per-task: "144"
mem: "0"

# Cluster footprint: 4 nodes in total (2 prefill + 2 decode).
# With gpus_per_node: 4, one 8-GPU worker occupies two whole nodes, so both
# the prefill worker and the decode worker span a node boundary.
resources:
gpu_type: "gb200"
gpus_per_node: 4
prefill_nodes: 2
prefill_workers: 1
gpus_per_prefill: 8
decode_nodes: 2
decode_workers: 1
gpus_per_decode: 8

# Inference backend selection.
backend:
type: sglang

# Environment variables for the prefill worker.
# All values are quoted so they stay strings, including "True" and the
# numeric timeouts.
prefill_environment:
PYTHONUNBUFFERED: "1"
SGLANG_RADIX_FORCE_MISS: "1"
SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
SGLANG_DEFAULT_THINKING: "1"
SGLANG_DSV4_REASONING_EFFORT: "max"
SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1"
NCCL_MNNVL_ENABLE: "1"
NCCL_CUMEM_ENABLE: "1"
SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
MC_FORCE_MNNVL: "1"
# NOTE(review): both timeouts are presumably in seconds — confirm units.
SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1"

# Environment variables for the decode worker.
# All values are quoted so they stay strings, including "True" and the
# numeric timeouts.
decode_environment:
PYTHONUNBUFFERED: "1"
SGLANG_RADIX_FORCE_MISS: "1"
SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
SGLANG_DEFAULT_THINKING: "1"
SGLANG_DSV4_REASONING_EFFORT: "max"
SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1"
NCCL_MNNVL_ENABLE: "1"
NCCL_CUMEM_ENABLE: "1"
SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
MC_FORCE_MNNVL: "1"
SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1"
# Decode runs tensor-parallel-size 8 with gpus_per_node: 4, so the decode
# worker spans two nodes. Custom all-reduce v2 (CAR_V2) is single-node only
# and is reported to corrupt results in multi-node decode; it defaults to on
# in SGLang, so it must be disabled explicitly here, as the GB200 mid-curve
# MTP recipes do.
SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "0"

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Missing multinode all-reduce disable

High Severity

The low-latency GB200 MTP recipes run TP8 decode across two nodes per worker but their decode_environment blocks never set SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 to 0. The mid-curve GB200 MTP recipes added in the same change do set that flag, and existing DSV4 slurm recipe notes tie custom all-reduce v2 to incorrect results on multi-node decode.

Additional Locations (1)
Fix in Cursor Fix in Web

Reviewed by Cursor Bugbot for commit 5c201f6. Configure here.

Comment on lines +50 to +64

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🔴 The two new low-latency GB200 recipes (this file and disagg-gb200-low-latency-1p6d-dep8-tp8-mtp.yaml) configure decode at TP=8 on a platform with gpus_per_node: 4, so each decode worker spans 2 nodes — but their decode_environment does not set SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "0". SGLang's default for CAR_V2 is on, and existing recipes in this directory document that CAR_V2 "is single-node only and corrupts results in 2-node decode setups". The six new GB200 mid-curve recipes in this same PR all correctly set SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "0" — please mirror that in the decode_environment of both low-latency recipes as well.

Extended reasoning...

What the bug is. Both new low-latency GB200 files in this PR — disagg-gb200-low-latency-1p1d-tp8-tp8-mtp.yaml and disagg-gb200-low-latency-1p6d-dep8-tp8-mtp.yaml — declare gpu_type: "gb200", gpus_per_node: 4, and decode tensor-parallel-size: 8. Each decode worker therefore spans 8/4 = 2 nodes. In that regime, every other recipe in this directory explicitly disables SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 in decode_environment, but these two new files do not.

Why existing code doesn't prevent it. The default for CAR_V2 in SGLang is on. perf-changelog.yaml:3221 (PR #1506) explicitly notes "Remove env vars redundant with sglang defaults (..., SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2)", and the single-node benchmark benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200.sh:29 sets SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1 to match. So omitting the var in the YAML means CAR_V2 runs.

Documented impact. disagg-low-latency-1p1d-tp4-tp4-mtp.yaml carries this comment for the omitted var:

> # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2
> # is single-node only and corrupts results in 2-node decode setups.

That comment is fine for the GB300 1p1d-tp4-tp4 / 1p6d-dep4-tp4 files because their decode TP=4 fits a single 4-GPU node. It does not apply to the new GB200 low-latency files, whose decode TP=8 always spans 2 nodes.

Inconsistency with the rest of this same PR. All six new GB200 mid-curve recipes added here (disagg-gb200-mid-curve-{1..6}p1d-dep8-dep16-mtp.yaml) set SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "0" in decode_environment (lines ~75 of each). Their decode is TP=16 (4 nodes); the same multi-node constraint applies. The low-latency files are the only outliers in the PR — a clear copy/paste oversight rather than an intentional choice.

Step-by-step proof for disagg-gb200-low-latency-1p1d-tp8-tp8-mtp.yaml.

1. The file declares gpus_per_node: 4 (line 23) and decode tensor-parallel-size: 8 (line 100, in the decode block of sglang_config).
2. With 4 GPUs per node and TP=8, the decode worker spans 8/4 = 2 nodes — confirmed by the file's own decode_nodes: 2, decode_workers: 1 (lines 26–28).
3. The container is lmsysorg/sglang:nightly-dev-cu13-20260528-0abe6a85 whose default for SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 is 1 (on) per the perf-changelog #1506 entry above.
4. decode_environment (lines 50–64) does not override this var, so CAR_V2 is active on a 2-node decode all-reduce — the exact regime the in-repo comments say "corrupts results."
5. The same logic applies to disagg-gb200-low-latency-1p6d-dep8-tp8-mtp.yaml, whose decode workers also run TP=8 on gpus_per_node: 4.

Fix. Add a single line to the decode_environment of both new GB200 low-latency files (mirroring the mid-curve siblings):

```yaml
SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "0"  # CAR_V2 is single-node only.
```

Leaving the bug in place means the low-latency points on the published curve this PR adds will be silently incorrect rather than crashing, which is the worst failure mode for a benchmark recipe.


# SGLang server arguments, given separately for the prefill and decode
# workers. Both sides use the same model, parser, transfer backend,
# parallelism (TP 8, DP 1, EP 1) and MoE runner; they differ in the
# disaggregation mode, the batching limits, and the decode-only speculative
# decoding and context settings.
sglang_config:
prefill:
served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
model-path: "/model/"
trust-remote-code: true
tool-call-parser: deepseekv4

disaggregation-mode: "prefill"
disaggregation-transfer-backend: mooncake

# TP 8 on 4-GPU nodes: this worker spans two nodes.
tensor-parallel-size: 8
data-parallel-size: 1
expert-parallel-size: 1

moe-runner-backend: "flashinfer_mxfp4"
disable-flashinfer-autotune: true

mem-fraction-static: 0.9
max-running-requests: 16
cuda-graph-max-bs: 8
chunked-prefill-size: 65536

decode:
served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
model-path: "/model/"
trust-remote-code: true
tool-call-parser: deepseekv4

disaggregation-mode: "decode"
disaggregation-transfer-backend: mooncake

# TP 8 on 4-GPU nodes: this worker spans two nodes.
tensor-parallel-size: 8
data-parallel-size: 1
expert-parallel-size: 1

moe-runner-backend: "flashinfer_mxfp4"
disable-flashinfer-autotune: true

# Speculative decoding is configured on the decode side only.
# NOTE(review): presumably this EAGLE setup is how the "mtp" variant is
# realised in SGLang — confirm.
speculative-algo: "EAGLE"
speculative-num-steps: 3
speculative-eagle-topk: 1
speculative-num-draft-tokens: 4

mem-fraction-static: 0.9
max-running-requests: 8
cuda-graph-max-bs: 8
swa-full-tokens-ratio: 0.1
# 16384 leaves room for the benchmark's 8192 input + 1024 output tokens.
context-length: 16384

# Benchmark driver settings: sa-bench at ISL 8192 / OSL 1024, concurrency 1,
# unbounded request rate.
benchmark:
type: "sa-bench"
isl: 8192
osl: 1024
# NOTE(review): presumably request lengths are sampled down to 80% of
# isl/osl — confirm against sa-bench.
random_range_ratio: 0.8
# Kept as a string; a single value here means one concurrency level.
concurrencies: "1"
req_rate: "inf"
use_chat_template: true
custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer"

Loading
Loading