Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
169 changes: 169 additions & 0 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9243,6 +9243,175 @@ dsv4-fp4-gb300-dynamo-sglang-mtp:
ep: 8
dp-attn: true

# DeepSeek-V4-Pro, FP4, GB300: disaggregated multinode dynamo-sglang sweep
# with "mtp" speculative decoding at fixed sequence lengths isl=1024 / osl=1024.
# Each search-space entry pairs one prefill/decode topology with the recipe
# file named in its CONFIG_FILE setting.
# NOTE(review): nesting below was reconstructed from a flattened diff view;
# confirm against the repository copy of nvidia-master.yaml.
dsv4-fp4-gb300-dynamo-sglang-mtp-1k1k:
  image: lmsysorg/sglang:nightly-dev-cu13-20260603-83bc7766
  model: deepseek-ai/DeepSeek-V4-Pro
  model-prefix: dsv4
  runner: gb300
  precision: fp4
  framework: dynamo-sglang
  multinode: true
  disagg: true
  scenarios:
    fixed-seq-len:
      - isl: 1024
        osl: 1024
        search-space:
          # --- Concurrency 8192: DP-attention prefill (tp 4 / ep 4) feeding
          # --- a single DP-attention decode worker of width 8 or 16.
          - spec-decoding: "mtp"
            conc-list: [8192]
            prefill:
              num-worker: 1
              tp: 4
              ep: 4
              dp-attn: true
              additional-settings:
                - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-1p1d-dep8-conc8192-mtp.yaml"
            decode:
              num-worker: 1
              tp: 8
              ep: 8
              dp-attn: true
          - spec-decoding: "mtp"
            conc-list: [8192]
            prefill:
              num-worker: 1
              tp: 4
              ep: 4
              dp-attn: true
              additional-settings:
                - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-1p1d-dep16-conc8192-mtp.yaml"
            decode:
              num-worker: 1
              tp: 16
              ep: 16
              dp-attn: true
          - spec-decoding: "mtp"
            conc-list: [8192]
            prefill:
              num-worker: 2
              tp: 4
              ep: 4
              dp-attn: true
              additional-settings:
                - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-2p1d-dep16-conc8192-mtp.yaml"
            decode:
              num-worker: 1
              tp: 16
              ep: 16
              dp-attn: true
          # --- Concurrency 64: DP-attention prefill (tp 4 / ep 4) with
          # --- 1, 2, 4 or 6 decode workers at tp 4 / ep 1, no DP attention.
          - spec-decoding: "mtp"
            conc-list: [64]
            prefill:
              num-worker: 1
              tp: 4
              ep: 4
              dp-attn: true
              additional-settings:
                - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p1d-dep4-tp4-mtp.yaml"
            decode:
              num-worker: 1
              tp: 4
              ep: 1
              dp-attn: false
          - spec-decoding: "mtp"
            conc-list: [64]
            prefill:
              num-worker: 1
              tp: 4
              ep: 4
              dp-attn: true
              additional-settings:
                - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p2d-dep4-tp4-mtp.yaml"
            decode:
              num-worker: 2
              tp: 4
              ep: 1
              dp-attn: false
          - spec-decoding: "mtp"
            conc-list: [64]
            prefill:
              num-worker: 1
              tp: 4
              ep: 4
              dp-attn: true
              additional-settings:
                - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p4d-dep4-tp4-mtp.yaml"
            decode:
              num-worker: 4
              tp: 4
              ep: 1
              dp-attn: false
          - spec-decoding: "mtp"
            conc-list: [64]
            prefill:
              num-worker: 1
              tp: 4
              ep: 4
              dp-attn: true
              additional-settings:
                - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p6d-dep4-tp4-mtp.yaml"
            decode:
              num-worker: 6
              tp: 4
              ep: 1
              dp-attn: false
          # --- Concurrency 2: prefill and decode both tp 4 / ep 1 without
          # --- DP attention; 1, 2, 4 or 6 decode workers.
          - spec-decoding: "mtp"
            conc-list: [2]
            prefill:
              num-worker: 1
              tp: 4
              ep: 1
              dp-attn: false
              additional-settings:
                - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p1d-tp4-tp4-mtp.yaml"
            decode:
              num-worker: 1
              tp: 4
              ep: 1
              dp-attn: false
          - spec-decoding: "mtp"
            conc-list: [2]
            prefill:
              num-worker: 1
              tp: 4
              ep: 1
              dp-attn: false
              additional-settings:
                - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p2d-tp4-tp4-mtp.yaml"
            decode:
              num-worker: 2
              tp: 4
              ep: 1
              dp-attn: false
          - spec-decoding: "mtp"
            conc-list: [2]
            prefill:
              num-worker: 1
              tp: 4
              ep: 1
              dp-attn: false
              additional-settings:
                - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p4d-tp4-tp4-mtp.yaml"
            decode:
              num-worker: 4
              tp: 4
              ep: 1
              dp-attn: false
          - spec-decoding: "mtp"
            conc-list: [2]
            prefill:
              num-worker: 1
              tp: 4
              ep: 1
              dp-attn: false
              additional-settings:
                - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p6d-tp4-tp4-mtp.yaml"
            decode:
              num-worker: 6
              tp: 4
              ep: 1
              dp-attn: false


kimik2.5-int4-h100-vllm:
image: vllm/vllm-openai:v0.20.2
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
# Recipe identifier; the suffix mirrors the topology configured further down
# (1 prefill worker, 1 decode worker of 16 GPUs, concurrency 8192).
# NOTE(review): nesting in this stanza was reconstructed from a flattened
# diff view; confirm against the recipe file in the repository.
name: "dsv4-pro-gb300-disagg-1k1k-mtp-1p1d-dep16-conc8192"

# Job wall-clock limit (presumably HH:MM:SS, i.e. three hours).
slurm:
  time_limit: "03:00:00"

# Extra sbatch options; values are kept as quoted strings.
# NOTE(review): with Slurm, a `mem` of 0 normally requests all memory on the
# node; confirm that is the intent here.
sbatch_directives:
  cpus-per-task: '144'
  mem: '0'

# Dynamo is installed at job time, pinned to a specific commit hash.
dynamo:
  hash: "34d55a596fb8d3d44daefe425ec1e303131f4d2c"
  install: true

# Dynamo frontend with multiple frontends enabled (8 additional ones).
frontend:
  type: dynamo
  enable_multiple_frontends: true
  num_additional_frontends: 8

# Model location, serving container image and weight precision.
# NOTE(review): nesting in this stanza was reconstructed from a flattened
# diff view; confirm against the recipe file in the repository.
model:
  path: dsv4-pro
  container: "lmsysorg/sglang:nightly-dev-cu13-20260603-83bc7766"
  precision: fp4

# Cluster layout with 4 GPUs per node:
#   prefill: 1 node  x 4 GPUs = 4 GPUs  -> 1 worker using 4 GPUs
#   decode:  4 nodes x 4 GPUs = 16 GPUs -> 1 worker using 16 GPUs
resources:
  gpu_type: gb300
  gpus_per_node: 4
  prefill_nodes: 1
  prefill_workers: 1
  gpus_per_prefill: 4
  decode_nodes: 4
  decode_workers: 1
  gpus_per_decode: 16

# SGLang backend: per-role environment variables for the prefill and decode
# workers. Values are quoted so they reach the process as strings.
# NOTE(review): nesting in this stanza was reconstructed from a flattened
# diff view; confirm against the recipe file in the repository.
backend:
  type: sglang

  # Environment for the prefill worker (single node, 4 GPUs).
  prefill_environment:
    PYTHONUNBUFFERED: '1'
    SGLANG_RADIX_DISABLE_REUSE: '1'
    SGLANG_JIT_DEEPGEMM_FAST_WARMUP: '1'
    SGLANG_DEFAULT_THINKING: '1'
    SGLANG_DSV4_REASONING_EFFORT: max
    SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: '1'
    SGLANG_OPT_USE_JIT_NORM: '1'
    SGLANG_OPT_USE_JIT_INDEXER_METADATA: '1'
    SGLANG_OPT_USE_TOPK_V2: '1'

    SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: '1'
    SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: '1'
    SGLANG_OPT_USE_FAST_MASK_EP: '1'
    SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: '1'
    SGLANG_OPT_FIX_HASH_MEGA_MOE: '1'
    SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: '9216'
    SGLANG_OPT_FIX_MEGA_MOE_MEMORY: '1'
    SGLANG_OPT_FIX_NEXTN_MEGA_MOE: '1'
    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '0'

    NCCL_MNNVL_ENABLE: '1'
    NCCL_CUMEM_ENABLE: '1'
    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True'
    MC_FORCE_MNNVL: '1'
    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000'
    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000'
    SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: '1'

  # Environment for the decode worker (spans 4 nodes / 16 GPUs). It differs
  # from prefill in the mega-MoE token cap (4096 vs 9216), the disabled
  # custom all-reduce v2, and the added distributed timeout.
  decode_environment:
    PYTHONUNBUFFERED: '1'
    SGLANG_RADIX_DISABLE_REUSE: '1'
    SGLANG_JIT_DEEPGEMM_FAST_WARMUP: '1'
    SGLANG_DEFAULT_THINKING: '1'
    SGLANG_DSV4_REASONING_EFFORT: max
    SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: '1'
    SGLANG_OPT_USE_JIT_NORM: '1'
    SGLANG_OPT_USE_JIT_INDEXER_METADATA: '1'
    SGLANG_OPT_USE_TOPK_V2: '1'

    SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: '1'
    SGLANG_OPT_USE_FAST_MASK_EP: '1'
    SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: '1'
    SGLANG_OPT_FIX_HASH_MEGA_MOE: '1'
    SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: '4096'
    SGLANG_OPT_FIX_MEGA_MOE_MEMORY: '1'
    SGLANG_OPT_FIX_NEXTN_MEGA_MOE: '1'
    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '0'

    NCCL_MNNVL_ENABLE: '1'
    NCCL_CUMEM_ENABLE: '1'
    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True'
    MC_FORCE_MNNVL: '1'
    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000'
    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000'
    SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: '1'
    # CAR_V2 is single-node only.
    SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: '0'
    # Multi-node decode initialization can outlast the shorter default
    # distributed timeout, so raise it. 1800 is the value the dep8 1k1k
    # recipe (disagg-1p1d-dep8-conc8192-mtp.yaml) is reported to use.
    # NOTE(review): confirm the value and units against that recipe.
    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800'

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Missing distributed timeout in dep16 recipes

Medium Severity

disagg-1p1d-dep8-conc8192-mtp.yaml sets TORCH_DISTRIBUTED_DEFAULT_TIMEOUT to 1800 in decode_environment for multi-node decode, but the new dep16 recipes (four decode nodes, TP16) omit it from decode_environment. Multi-node decode initialization takes longer and can exceed the shorter default timeout.

Additional Locations (1)
Fix in Cursor Fix in Web

Reviewed by Cursor Bugbot for commit 37100c4. Configure here.


# SGLang launch settings, one section per worker role (`prefill`, `decode`).
# NOTE(review): the diff view flattened all indentation. In the recipe file
# this mapping presumably nests under `backend:`, and the keys after
# `prefill:` / `decode:` nest under those roles; confirm before copying.
sglang_config:
# Prefill role: tensor/data/expert parallel sizes are all 4.
prefill:
served-model-name: deepseek-ai/DeepSeek-V4-Pro
model-path: "/model/"
trust-remote-code: true
tool-call-parser: deepseekv4

# Disaggregation role and transfer backend for this worker.
disaggregation-mode: prefill
disaggregation-transfer-backend: mooncake

tensor-parallel-size: 4
data-parallel-size: 4
expert-parallel-size: 4

enable-dp-attention: true
enable-dp-lm-head: true

# deepep-config is a JSON document passed as a single-quoted string.
moe-a2a-backend: deepep
deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'

mem-fraction-static: 0.9
max-running-requests: 1024
cuda-graph-max-bs: 1024
chunked-prefill-size: 32768
stream-interval: 60

# Decode role: tensor/data/expert parallel sizes are all 16.
decode:
served-model-name: deepseek-ai/DeepSeek-V4-Pro
model-path: "/model/"
trust-remote-code: true
tool-call-parser: deepseekv4

# Disaggregation role and transfer backend for this worker.
disaggregation-mode: decode
disaggregation-transfer-backend: mooncake

tensor-parallel-size: 16
data-parallel-size: 16
expert-parallel-size: 16

enable-dp-attention: true
enable-dp-lm-head: true

# Same DeepEP settings as the prefill section.
moe-a2a-backend: deepep
deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'

# Speculative decoding (EAGLE) is configured only in the decode section;
# the prefill section above carries no speculative-* keys.
speculative-algo: "EAGLE"
speculative-num-steps: 3
speculative-eagle-topk: 1
speculative-num-draft-tokens: 4

# Decode-only keys in this group: swa-full-tokens-ratio and context-length.
mem-fraction-static: 0.9
max-running-requests: 18432
cuda-graph-max-bs: 1024
swa-full-tokens-ratio: 0.15
context-length: 16384
stream-interval: 60

# Load generator settings: sa-bench at isl/osl 1024, a single concurrency of
# 8192, request rate "inf", chat template applied, custom tokenizer class.
# `concurrencies` and `req_rate` are deliberately quoted strings.
# NOTE(review): nesting was reconstructed from a flattened diff view;
# confirm against the recipe file in the repository.
benchmark:
  type: "sa-bench"
  isl: 1024
  osl: 1024
  random_range_ratio: 0.8
  concurrencies: "8192"
  req_rate: "inf"
  use_chat_template: true
  custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer"
Loading
Loading