diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 957dd0934..d1ad86d80 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -9243,6 +9243,175 @@ dsv4-fp4-gb300-dynamo-sglang-mtp: ep: 8 dp-attn: true +dsv4-fp4-gb300-dynamo-sglang-mtp-1k1k: + image: lmsysorg/sglang:nightly-dev-cu13-20260603-83bc7766 + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: gb300 + precision: fp4 + framework: dynamo-sglang + multinode: true + disagg: true + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - spec-decoding: "mtp" + conc-list: [8192] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-1p1d-dep8-conc8192-mtp.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [8192] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-1p1d-dep16-conc8192-mtp.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [8192] + prefill: + num-worker: 2 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-2p1d-dep16-conc8192-mtp.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [64] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p1d-dep4-tp4-mtp.yaml" + decode: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [64] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p2d-dep4-tp4-mtp.yaml" + decode: + num-worker: 2 + tp: 4 + ep: 1 + dp-attn: false + - spec-decoding: "mtp" + 
conc-list: [64] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p4d-dep4-tp4-mtp.yaml" + decode: + num-worker: 4 + tp: 4 + ep: 1 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [64] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p6d-dep4-tp4-mtp.yaml" + decode: + num-worker: 6 + tp: 4 + ep: 1 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [2] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p1d-tp4-tp4-mtp.yaml" + decode: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [2] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p2d-tp4-tp4-mtp.yaml" + decode: + num-worker: 2 + tp: 4 + ep: 1 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [2] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p4d-tp4-tp4-mtp.yaml" + decode: + num-worker: 4 + tp: 4 + ep: 1 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [2] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p6d-tp4-tp4-mtp.yaml" + decode: + num-worker: 6 + tp: 4 + ep: 1 + dp-attn: false + kimik2.5-int4-h100-vllm: image: vllm/vllm-openai:v0.20.2 diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-1p1d-dep16-conc8192-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-1p1d-dep16-conc8192-mtp.yaml new file mode 100644 index 000000000..80a9d1455 --- /dev/null +++ 
b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-1p1d-dep16-conc8192-mtp.yaml @@ -0,0 +1,160 @@ +name: "dsv4-pro-gb300-disagg-1k1k-mtp-1p1d-dep16-conc8192" + +slurm: + time_limit: "03:00:00" + +sbatch_directives: + cpus-per-task: '144' + mem: '0' + +dynamo: + hash: "34d55a596fb8d3d44daefe425ec1e303131f4d2c" + install: true + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 8 + +model: + path: dsv4-pro + container: "lmsysorg/sglang:nightly-dev-cu13-20260603-83bc7766" + precision: fp4 + +resources: + gpu_type: gb300 + gpus_per_node: 4 + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + decode_nodes: 4 + decode_workers: 1 + gpus_per_decode: 16 + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: '1' + SGLANG_RADIX_DISABLE_REUSE: '1' + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: '1' + SGLANG_DEFAULT_THINKING: '1' + SGLANG_DSV4_REASONING_EFFORT: max + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: '1' + SGLANG_OPT_USE_JIT_NORM: '1' + SGLANG_OPT_USE_JIT_INDEXER_METADATA: '1' + SGLANG_OPT_USE_TOPK_V2: '1' + + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: '1' + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: '1' + SGLANG_OPT_USE_FAST_MASK_EP: '1' + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: '1' + SGLANG_OPT_FIX_HASH_MEGA_MOE: '1' + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: '9216' + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: '1' + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: '1' + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '0' + + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + MC_FORCE_MNNVL: '1' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: '1' + + decode_environment: + PYTHONUNBUFFERED: '1' + SGLANG_RADIX_DISABLE_REUSE: '1' + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: '1' + SGLANG_DEFAULT_THINKING: '1' + SGLANG_DSV4_REASONING_EFFORT: max + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: '1' + 
SGLANG_OPT_USE_JIT_NORM: '1' + SGLANG_OPT_USE_JIT_INDEXER_METADATA: '1' + SGLANG_OPT_USE_TOPK_V2: '1' + + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: '1' + SGLANG_OPT_USE_FAST_MASK_EP: '1' + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: '1' + SGLANG_OPT_FIX_HASH_MEGA_MOE: '1' + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: '4096' + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: '1' + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: '1' + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '0' + + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + MC_FORCE_MNNVL: '1' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: '1' + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "0" # CAR_V2 is single-node only. + + sglang_config: + prefill: + served-model-name: deepseek-ai/DeepSeek-V4-Pro + model-path: "/model/" + trust-remote-code: true + tool-call-parser: deepseekv4 + + disaggregation-mode: prefill + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + + enable-dp-attention: true + enable-dp-lm-head: true + + moe-a2a-backend: deepep + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + mem-fraction-static: 0.9 + max-running-requests: 1024 + cuda-graph-max-bs: 1024 + chunked-prefill-size: 32768 + stream-interval: 60 + + decode: + served-model-name: deepseek-ai/DeepSeek-V4-Pro + model-path: "/model/" + trust-remote-code: true + tool-call-parser: deepseekv4 + + disaggregation-mode: decode + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 16 + data-parallel-size: 16 + expert-parallel-size: 16 + + enable-dp-attention: true + enable-dp-lm-head: true + + moe-a2a-backend: deepep + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + speculative-algo: "EAGLE" + speculative-num-steps: 3 + speculative-eagle-topk: 1 + 
speculative-num-draft-tokens: 4 + + mem-fraction-static: 0.9 + max-running-requests: 18432 + cuda-graph-max-bs: 1024 + swa-full-tokens-ratio: 0.15 + context-length: 16384 + stream-interval: 60 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + concurrencies: "8192" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-1p1d-dep8-conc8192-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-1p1d-dep8-conc8192-mtp.yaml new file mode 100644 index 000000000..133beff1c --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-1p1d-dep8-conc8192-mtp.yaml @@ -0,0 +1,161 @@ +name: "dsv4-pro-gb300-disagg-1k1k-mtp-1p1d-dep8-conc8192" + +slurm: + time_limit: "03:00:00" + +sbatch_directives: + cpus-per-task: '144' + mem: '0' + +dynamo: + hash: "34d55a596fb8d3d44daefe425ec1e303131f4d2c" + install: true + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 8 + +model: + path: dsv4-pro + container: "lmsysorg/sglang:nightly-dev-cu13-20260603-83bc7766" + precision: fp4 + +resources: + gpu_type: gb300 + gpus_per_node: 4 + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + decode_nodes: 2 + decode_workers: 1 + gpus_per_decode: 8 + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: '1' + SGLANG_RADIX_DISABLE_REUSE: '1' + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: '1' + SGLANG_DEFAULT_THINKING: '1' + SGLANG_DSV4_REASONING_EFFORT: max + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: '1' + SGLANG_OPT_USE_JIT_NORM: '1' + SGLANG_OPT_USE_JIT_INDEXER_METADATA: '1' + SGLANG_OPT_USE_TOPK_V2: '1' + + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: '1' + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: '1' + SGLANG_OPT_USE_FAST_MASK_EP: '1' + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: '1' + SGLANG_OPT_FIX_HASH_MEGA_MOE: '1' + 
SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: '9216' + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: '1' + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: '1' + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '0' + + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + MC_FORCE_MNNVL: '1' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: '1' + + decode_environment: + PYTHONUNBUFFERED: '1' + SGLANG_RADIX_DISABLE_REUSE: '1' + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: '1' + SGLANG_DEFAULT_THINKING: '1' + SGLANG_DSV4_REASONING_EFFORT: max + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: '1' + SGLANG_OPT_USE_JIT_NORM: '1' + SGLANG_OPT_USE_JIT_INDEXER_METADATA: '1' + SGLANG_OPT_USE_TOPK_V2: '1' + + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: '1' + SGLANG_OPT_USE_FAST_MASK_EP: '1' + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: '1' + SGLANG_OPT_FIX_HASH_MEGA_MOE: '1' + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: '2048' + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: '1' + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: '1' + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '0' + + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + MC_FORCE_MNNVL: '1' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: '1' + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "0" # CAR_V2 is single-node only. 
+ TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + + sglang_config: + prefill: + served-model-name: deepseek-ai/DeepSeek-V4-Pro + model-path: "/model/" + trust-remote-code: true + tool-call-parser: deepseekv4 + + disaggregation-mode: prefill + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + + enable-dp-attention: true + enable-dp-lm-head: true + + moe-a2a-backend: deepep + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + mem-fraction-static: 0.9 + max-running-requests: 256 + cuda-graph-max-bs: 256 + chunked-prefill-size: 32768 + stream-interval: 60 + + decode: + served-model-name: deepseek-ai/DeepSeek-V4-Pro + model-path: "/model/" + trust-remote-code: true + tool-call-parser: deepseekv4 + + disaggregation-mode: decode + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 8 + + enable-dp-attention: true + enable-dp-lm-head: true + + moe-a2a-backend: deepep + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + speculative-algo: "EAGLE" + speculative-num-steps: 3 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 4 + + mem-fraction-static: 0.94 + max-running-requests: 1536 + cuda-graph-max-bs: 256 + swa-full-tokens-ratio: 0.15 + context-length: 16384 + stream-interval: 60 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + concurrencies: "8192" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-2p1d-dep16-conc8192-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-2p1d-dep16-conc8192-mtp.yaml new file mode 100644 index 000000000..5176c28f2 --- /dev/null +++ 
b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-2p1d-dep16-conc8192-mtp.yaml @@ -0,0 +1,160 @@ +name: "dsv4-pro-gb300-disagg-1k1k-mtp-2p1d-dep16-conc8192" + +slurm: + time_limit: "03:00:00" + +sbatch_directives: + cpus-per-task: '144' + mem: '0' + +dynamo: + hash: "34d55a596fb8d3d44daefe425ec1e303131f4d2c" + install: true + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 8 + +model: + path: dsv4-pro + container: "lmsysorg/sglang:nightly-dev-cu13-20260603-83bc7766" + precision: fp4 + +resources: + gpu_type: gb300 + gpus_per_node: 4 + prefill_nodes: 2 + prefill_workers: 2 + gpus_per_prefill: 4 + decode_nodes: 4 + decode_workers: 1 + gpus_per_decode: 16 + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: '1' + SGLANG_RADIX_DISABLE_REUSE: '1' + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: '1' + SGLANG_DEFAULT_THINKING: '1' + SGLANG_DSV4_REASONING_EFFORT: max + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: '1' + SGLANG_OPT_USE_JIT_NORM: '1' + SGLANG_OPT_USE_JIT_INDEXER_METADATA: '1' + SGLANG_OPT_USE_TOPK_V2: '1' + + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: '1' + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: '1' + SGLANG_OPT_USE_FAST_MASK_EP: '1' + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: '1' + SGLANG_OPT_FIX_HASH_MEGA_MOE: '1' + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: '9216' + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: '1' + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: '1' + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '0' + + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + MC_FORCE_MNNVL: '1' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: '1' + + decode_environment: + PYTHONUNBUFFERED: '1' + SGLANG_RADIX_DISABLE_REUSE: '1' + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: '1' + SGLANG_DEFAULT_THINKING: '1' + SGLANG_DSV4_REASONING_EFFORT: max + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: '1' + 
SGLANG_OPT_USE_JIT_NORM: '1' + SGLANG_OPT_USE_JIT_INDEXER_METADATA: '1' + SGLANG_OPT_USE_TOPK_V2: '1' + + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: '1' + SGLANG_OPT_USE_FAST_MASK_EP: '1' + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: '1' + SGLANG_OPT_FIX_HASH_MEGA_MOE: '1' + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: '4096' + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: '1' + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: '1' + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '0' + + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + MC_FORCE_MNNVL: '1' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: '1' + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "0" # CAR_V2 is single-node only. + + sglang_config: + prefill: + served-model-name: deepseek-ai/DeepSeek-V4-Pro + model-path: "/model/" + trust-remote-code: true + tool-call-parser: deepseekv4 + + disaggregation-mode: prefill + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + + enable-dp-attention: true + enable-dp-lm-head: true + + moe-a2a-backend: deepep + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + mem-fraction-static: 0.9 + max-running-requests: 1024 + cuda-graph-max-bs: 1024 + chunked-prefill-size: 32768 + stream-interval: 60 + + decode: + served-model-name: deepseek-ai/DeepSeek-V4-Pro + model-path: "/model/" + trust-remote-code: true + tool-call-parser: deepseekv4 + + disaggregation-mode: decode + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 16 + data-parallel-size: 16 + expert-parallel-size: 16 + + enable-dp-attention: true + enable-dp-lm-head: true + + moe-a2a-backend: deepep + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + speculative-algo: "EAGLE" + speculative-num-steps: 3 + speculative-eagle-topk: 1 + 
speculative-num-draft-tokens: 4 + + mem-fraction-static: 0.9 + max-running-requests: 18432 + cuda-graph-max-bs: 1024 + swa-full-tokens-ratio: 0.15 + context-length: 16384 + stream-interval: 60 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + concurrencies: "8192" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p1d-dep4-tp4-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p1d-dep4-tp4-mtp.yaml new file mode 100644 index 000000000..92a7d98a5 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p1d-dep4-tp4-mtp.yaml @@ -0,0 +1,134 @@ +name: "dsv4-pro-gb300-disagg-1k1k-mtp-low-latency-1p1d-dep4-tp4" + +frontend: + type: sglang + enable_multiple_frontends: false + args: + policy: "cache_aware" + +model: + path: "dsv4-pro" + container: "lmsysorg/sglang:nightly-dev-cu13-20260510-2473659e" + precision: "mxfp4" + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 1 + decode_workers: 1 + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "9216" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + 
SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" + + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2 + # is single-node only and corrupts results in 2-node decode setups. 
+ + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + disable-radix-cache: true + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + + enable-dp-attention: true + enable-dp-lm-head: true + + moe-a2a-backend: "deepep" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + mem-fraction-static: 0.90 + max-running-requests: 512 + cuda-graph-max-bs: 512 + chunked-prefill-size: 4096 + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + disable-radix-cache: true + + disaggregation-mode: "decode" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 1 + expert-parallel-size: 1 + + moe-runner-backend: "flashinfer_mxfp4" + disable-flashinfer-autotune: true + + speculative-algo: "EAGLE" + speculative-num-steps: 3 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 4 + + mem-fraction-static: 0.9 + max-running-requests: 64 + cuda-graph-max-bs: 64 + swa-full-tokens-ratio: 0.1 + context-length: 4096 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + concurrencies: "1x2x4x8x16x32x64" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p1d-tp4-tp4-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p1d-tp4-tp4-mtp.yaml new file mode 100644 index 000000000..97935ebec --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p1d-tp4-tp4-mtp.yaml @@ -0,0 +1,120 @@ +name: "dsv4-pro-gb300-disagg-1k1k-mtp-low-latency-1p1d-tp4-tp4" + +frontend: + type: sglang + 
enable_multiple_frontends: false + args: + policy: "cache_aware" + +model: + path: "dsv4-pro" + container: "lmsysorg/sglang:nightly-dev-cu13-20260510-2473659e" + precision: "mxfp4" + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 1 + decode_workers: 1 + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2 + # is single-node only and corrupts results in 2-node decode setups. 
+ + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + disable-radix-cache: true + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 1 + expert-parallel-size: 1 + + moe-runner-backend: "flashinfer_mxfp4" + disable-flashinfer-autotune: true + + mem-fraction-static: 0.90 + max-running-requests: 8 + cuda-graph-max-bs: 8 + chunked-prefill-size: 4096 + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + disable-radix-cache: true + + disaggregation-mode: "decode" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 1 + expert-parallel-size: 1 + + moe-runner-backend: "flashinfer_mxfp4" + disable-flashinfer-autotune: true + + speculative-algo: "EAGLE" + speculative-num-steps: 3 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 4 + + mem-fraction-static: 0.9 + max-running-requests: 8 + cuda-graph-max-bs: 8 + swa-full-tokens-ratio: 0.1 + context-length: 4096 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + concurrencies: "1x2" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p2d-dep4-tp4-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p2d-dep4-tp4-mtp.yaml new file mode 100644 index 000000000..fe154f889 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p2d-dep4-tp4-mtp.yaml @@ -0,0 +1,134 @@ +name: "dsv4-pro-gb300-disagg-1k1k-mtp-low-latency-1p2d-dep4-tp4" + +frontend: + type: sglang + enable_multiple_frontends: false + args: + policy: "cache_aware" + +model: + path: "dsv4-pro" + container: 
"lmsysorg/sglang:nightly-dev-cu13-20260510-2473659e" + precision: "mxfp4" + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 2 + decode_workers: 2 + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "9216" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" + + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2 + # is single-node only and corrupts results in 2-node decode setups. 
+ + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + disable-radix-cache: true + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + + enable-dp-attention: true + enable-dp-lm-head: true + + moe-a2a-backend: "deepep" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + mem-fraction-static: 0.90 + max-running-requests: 512 + cuda-graph-max-bs: 512 + chunked-prefill-size: 4096 + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + disable-radix-cache: true + + disaggregation-mode: "decode" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 1 + expert-parallel-size: 1 + + moe-runner-backend: "flashinfer_mxfp4" + disable-flashinfer-autotune: true + + speculative-algo: "EAGLE" + speculative-num-steps: 3 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 4 + + mem-fraction-static: 0.9 + max-running-requests: 64 + cuda-graph-max-bs: 64 + swa-full-tokens-ratio: 0.1 + context-length: 4096 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + concurrencies: "1x2x4x8x16x32x64" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p2d-tp4-tp4-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p2d-tp4-tp4-mtp.yaml new file mode 100644 index 000000000..454c5cd09 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p2d-tp4-tp4-mtp.yaml @@ -0,0 +1,120 @@ +name: "dsv4-pro-gb300-disagg-1k1k-mtp-low-latency-1p2d-tp4-tp4" + +frontend: + type: sglang + 
enable_multiple_frontends: false + args: + policy: "cache_aware" + +model: + path: "dsv4-pro" + container: "lmsysorg/sglang:nightly-dev-cu13-20260510-2473659e" + precision: "mxfp4" + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 2 + decode_workers: 2 + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2 + # is single-node only and corrupts results in 2-node decode setups. 
+ + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + disable-radix-cache: true + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 1 + expert-parallel-size: 1 + + moe-runner-backend: "flashinfer_mxfp4" + disable-flashinfer-autotune: true + + mem-fraction-static: 0.90 + max-running-requests: 8 + cuda-graph-max-bs: 8 + chunked-prefill-size: 4096 + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + disable-radix-cache: true + + disaggregation-mode: "decode" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 1 + expert-parallel-size: 1 + + moe-runner-backend: "flashinfer_mxfp4" + disable-flashinfer-autotune: true + + speculative-algo: "EAGLE" + speculative-num-steps: 3 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 4 + + mem-fraction-static: 0.9 + max-running-requests: 8 + cuda-graph-max-bs: 8 + swa-full-tokens-ratio: 0.1 + context-length: 4096 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + concurrencies: "1x2" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p4d-dep4-tp4-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p4d-dep4-tp4-mtp.yaml new file mode 100644 index 000000000..148d27707 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p4d-dep4-tp4-mtp.yaml @@ -0,0 +1,134 @@ +name: "dsv4-pro-gb300-disagg-1k1k-mtp-low-latency-1p4d-dep4-tp4" + +frontend: + type: sglang + enable_multiple_frontends: false + args: + policy: "cache_aware" + +model: + path: "dsv4-pro" + container: 
"lmsysorg/sglang:nightly-dev-cu13-20260510-2473659e" + precision: "mxfp4" + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 4 + decode_workers: 4 + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "9216" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" + + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2 + # is single-node only and corrupts results in 2-node decode setups. 
+ + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + disable-radix-cache: true + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + + enable-dp-attention: true + enable-dp-lm-head: true + + moe-a2a-backend: "deepep" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + mem-fraction-static: 0.90 + max-running-requests: 512 + cuda-graph-max-bs: 512 + chunked-prefill-size: 4096 + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + disable-radix-cache: true + + disaggregation-mode: "decode" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 1 + expert-parallel-size: 1 + + moe-runner-backend: "flashinfer_mxfp4" + disable-flashinfer-autotune: true + + speculative-algo: "EAGLE" + speculative-num-steps: 3 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 4 + + mem-fraction-static: 0.9 + max-running-requests: 64 + cuda-graph-max-bs: 64 + swa-full-tokens-ratio: 0.1 + context-length: 4096 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + concurrencies: "1x2x4x8x16x32x64" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p4d-tp4-tp4-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p4d-tp4-tp4-mtp.yaml new file mode 100644 index 000000000..6b77bf113 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p4d-tp4-tp4-mtp.yaml @@ -0,0 +1,120 @@ +name: "dsv4-pro-gb300-disagg-1k1k-mtp-low-latency-1p4d-tp4-tp4" + +frontend: + type: sglang + 
enable_multiple_frontends: false + args: + policy: "cache_aware" + +model: + path: "dsv4-pro" + container: "lmsysorg/sglang:nightly-dev-cu13-20260510-2473659e" + precision: "mxfp4" + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 4 + decode_workers: 4 + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2 + # is single-node only and corrupts results in 2-node decode setups. 
+ + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + disable-radix-cache: true + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 1 + expert-parallel-size: 1 + + moe-runner-backend: "flashinfer_mxfp4" + disable-flashinfer-autotune: true + + mem-fraction-static: 0.90 + max-running-requests: 8 + cuda-graph-max-bs: 8 + chunked-prefill-size: 4096 + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + disable-radix-cache: true + + disaggregation-mode: "decode" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 1 + expert-parallel-size: 1 + + moe-runner-backend: "flashinfer_mxfp4" + disable-flashinfer-autotune: true + + speculative-algo: "EAGLE" + speculative-num-steps: 3 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 4 + + mem-fraction-static: 0.9 + max-running-requests: 8 + cuda-graph-max-bs: 8 + swa-full-tokens-ratio: 0.1 + context-length: 4096 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + concurrencies: "1x2" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p6d-dep4-tp4-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p6d-dep4-tp4-mtp.yaml new file mode 100644 index 000000000..9e7da0458 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p6d-dep4-tp4-mtp.yaml @@ -0,0 +1,134 @@ +name: "dsv4-pro-gb300-disagg-1k1k-mtp-low-latency-1p6d-dep4-tp4" + +frontend: + type: sglang + enable_multiple_frontends: false + args: + policy: "cache_aware" + +model: + path: "dsv4-pro" + container: 
"lmsysorg/sglang:nightly-dev-cu13-20260510-2473659e" + precision: "mxfp4" + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 6 + decode_workers: 6 + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "9216" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" + + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2 + # is single-node only and corrupts results in 2-node decode setups. 
+ + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + disable-radix-cache: true + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + + enable-dp-attention: true + enable-dp-lm-head: true + + moe-a2a-backend: "deepep" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + mem-fraction-static: 0.90 + max-running-requests: 512 + cuda-graph-max-bs: 512 + chunked-prefill-size: 4096 + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + disable-radix-cache: true + + disaggregation-mode: "decode" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 1 + expert-parallel-size: 1 + + moe-runner-backend: "flashinfer_mxfp4" + disable-flashinfer-autotune: true + + speculative-algo: "EAGLE" + speculative-num-steps: 3 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 4 + + mem-fraction-static: 0.9 + max-running-requests: 64 + cuda-graph-max-bs: 64 + swa-full-tokens-ratio: 0.1 + context-length: 4096 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + concurrencies: "1x2x4x8x16x32x64" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p6d-tp4-tp4-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p6d-tp4-tp4-mtp.yaml new file mode 100644 index 000000000..cb657a956 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p6d-tp4-tp4-mtp.yaml @@ -0,0 +1,120 @@ +name: "dsv4-pro-gb300-disagg-1k1k-mtp-low-latency-1p6d-tp4-tp4" + +frontend: + type: sglang + 
enable_multiple_frontends: false + args: + policy: "cache_aware" + +model: + path: "dsv4-pro" + container: "lmsysorg/sglang:nightly-dev-cu13-20260510-2473659e" + precision: "mxfp4" + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 6 + decode_workers: 6 + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2 + # is single-node only and corrupts results in 2-node decode setups. 
+ + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + disable-radix-cache: true + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 1 + expert-parallel-size: 1 + + moe-runner-backend: "flashinfer_mxfp4" + disable-flashinfer-autotune: true + + mem-fraction-static: 0.90 + max-running-requests: 8 + cuda-graph-max-bs: 8 + chunked-prefill-size: 4096 + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + disable-radix-cache: true + + disaggregation-mode: "decode" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 1 + expert-parallel-size: 1 + + moe-runner-backend: "flashinfer_mxfp4" + disable-flashinfer-autotune: true + + speculative-algo: "EAGLE" + speculative-num-steps: 3 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 4 + + mem-fraction-static: 0.9 + max-running-requests: 8 + cuda-graph-max-bs: 8 + swa-full-tokens-ratio: 0.1 + context-length: 4096 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + concurrencies: "1x2" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 72764bcc8..ba26f8d57 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3579,3 +3579,13 @@ - "MI355x DSR1-FP4: Include TP4 configurations for 8k1k" - "Expand the TP sweep (included TP=4) for 8k/1k configuration for conc=4 to 64" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1692 + +- config-keys: + - dsv4-fp4-gb300-dynamo-sglang-mtp-1k1k + description: + - "Add DeepSeek-V4-Pro FP4 GB300 disaggregated SGLang MTP 1k/1k coverage using lmsysorg/sglang:nightly-dev-cu13-20260603-83bc7766" + - "New top-level config (sibling of 
dsv4-fp4-gb300-dynamo-sglang-mtp which stays on the older 8k/1k image nightly-dev-20260527-14f81a67); separate entry is required because the launcher builds one squashfs per top-level image and the high-throughput 1k/1k recipes pin a newer container string" + - "Wires 11 disagg recipes adapted from NVIDIA/srt-slurm PR #177 (recipes/dsv4-pro/sglang/gb300-fp4/1k1k/disagg/mtp/), staged locally under benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/: 3 high-throughput conc=8192 recipes (1p1d-dep8, 1p1d-dep16, 2p1d-dep16), 4 flat low-latency dep4 recipes (1p1d/1p2d/1p4d/1p6d-dep4-tp4 at conc=64), and 4 flat low-latency tp4 recipes (1p1d/1p2d/1p4d/1p6d-tp4-tp4 at conc=2)" + - "Flat dep4 and tp4 splits replace the upstream disagg-low-latency-dep4-mtp.yaml and disagg-low-latency-tp4-mtp.yaml templates (each base + zip_override_1k1k_lowlat with decode_nodes [1,2,4,6]). The templated form silently broke under the gb300-cw launcher: srtctl fans the single recipe into 4 Slurm submissions per `srtctl apply`, but launch_gb300-cw.sh:282 captures the job IDs with `grep -oP '✅ Job \\K[0-9]+'` into a scalar JOB_ID, jamming 4 newline-separated IDs into one variable. Downstream uses (LOGS_DIR path interpolation at :293-294, squeue polls at :313-314, error message at :315) are all corrupted as a result, producing the observed `ERROR: Job 7079\\n7082 failed before creating log file` and orphaning the real Slurm jobs. Single-config files match the 8k1k convention (one recipe per decode-worker count) and force srtctl to emit exactly 1 job per launcher invocation" + - "Decode side runs EAGLE (num-steps=3, eagle-topk=1, num-draft-tokens=4) on all 11 recipes; recipe container strings are kept upstream-faithful (high-throughput recipes pin nightly-dev-cu13-20260603-83bc7766, low-latency recipes pin nightly-dev-cu13-20260510-2473659e)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1697