From 2aeafb4d314af6e3f7f27d0c4a0f9c736ba56524 Mon Sep 17 00:00:00 2001 From: hshrivastava-droid Date: Tue, 9 Jun 2026 14:23:00 -0700 Subject: [PATCH 1/6] add dsv4 1k1k --- .github/configs/nvidia-master.yaml | 85 +++++++++ .../1k1k/disagg-1p1d-dep16-conc8192-mtp.yaml | 160 +++++++++++++++++ .../1k1k/disagg-1p1d-dep8-conc8192-mtp.yaml | 161 ++++++++++++++++++ .../1k1k/disagg-2p1d-dep16-conc8192-mtp.yaml | 160 +++++++++++++++++ .../1k1k/disagg-low-latency-dep4-mtp.yaml | 148 ++++++++++++++++ .../1k1k/disagg-low-latency-tp4-mtp.yaml | 134 +++++++++++++++ perf-changelog.yaml | 11 ++ 7 files changed, 859 insertions(+) create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-1p1d-dep16-conc8192-mtp.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-1p1d-dep8-conc8192-mtp.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-2p1d-dep16-conc8192-mtp.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-dep4-mtp.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-tp4-mtp.yaml diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index a02749d4d..54e875833 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -9217,6 +9217,91 @@ dsv4-fp4-gb300-dynamo-sglang-mtp: ep: 8 dp-attn: true +dsv4-fp4-gb300-dynamo-sglang-mtp-1k1k: + image: lmsysorg/sglang:nightly-dev-cu13-20260603-83bc7766 + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: gb300-cw + precision: fp4 + framework: dynamo-sglang + multinode: true + disagg: true + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - spec-decoding: "mtp" + conc-list: [8192] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-1p1d-dep8-conc8192-mtp.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [8192] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-1p1d-dep16-conc8192-mtp.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [8192] + prefill: + num-worker: 2 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-2p1d-dep16-conc8192-mtp.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [64] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-dep4-mtp.yaml" + decode: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [2] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-tp4-mtp.yaml" + decode: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + kimik2.5-int4-h100-vllm: image: vllm/vllm-openai:v0.20.2 diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-1p1d-dep16-conc8192-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-1p1d-dep16-conc8192-mtp.yaml new file mode 100644 index 000000000..80a9d1455 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-1p1d-dep16-conc8192-mtp.yaml @@ -0,0 +1,160 @@ +name: "dsv4-pro-gb300-disagg-1k1k-mtp-1p1d-dep16-conc8192" + +slurm: + time_limit: "03:00:00" + +sbatch_directives: + cpus-per-task: '144' + mem: '0' + +dynamo: + hash: "34d55a596fb8d3d44daefe425ec1e303131f4d2c" + install: true + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 8 + +model: + path: dsv4-pro + container: "lmsysorg/sglang:nightly-dev-cu13-20260603-83bc7766" + precision: fp4 + +resources: + gpu_type: gb300 + gpus_per_node: 4 + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + decode_nodes: 4 + decode_workers: 1 + gpus_per_decode: 16 + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: '1' + SGLANG_RADIX_DISABLE_REUSE: '1' + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: '1' + SGLANG_DEFAULT_THINKING: '1' + SGLANG_DSV4_REASONING_EFFORT: max + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: '1' + SGLANG_OPT_USE_JIT_NORM: '1' + SGLANG_OPT_USE_JIT_INDEXER_METADATA: '1' + SGLANG_OPT_USE_TOPK_V2: '1' + + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: '1' + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: '1' + SGLANG_OPT_USE_FAST_MASK_EP: '1' + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: '1' + SGLANG_OPT_FIX_HASH_MEGA_MOE: '1' + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: '9216' + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: '1' + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: '1' + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '0' + + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + MC_FORCE_MNNVL: '1' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: '1' + + decode_environment: + PYTHONUNBUFFERED: '1' + SGLANG_RADIX_DISABLE_REUSE: '1' + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: '1' + SGLANG_DEFAULT_THINKING: '1' + SGLANG_DSV4_REASONING_EFFORT: max + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: '1' + SGLANG_OPT_USE_JIT_NORM: '1' + SGLANG_OPT_USE_JIT_INDEXER_METADATA: '1' + SGLANG_OPT_USE_TOPK_V2: '1' + + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: '1' + SGLANG_OPT_USE_FAST_MASK_EP: '1' + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: '1' + SGLANG_OPT_FIX_HASH_MEGA_MOE: '1' + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: '4096' + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: '1' + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: '1' + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '0' + + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + MC_FORCE_MNNVL: '1' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: '1' + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "0" # CAR_V2 is single-node only. + + sglang_config: + prefill: + served-model-name: deepseek-ai/DeepSeek-V4-Pro + model-path: "/model/" + trust-remote-code: true + tool-call-parser: deepseekv4 + + disaggregation-mode: prefill + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + + enable-dp-attention: true + enable-dp-lm-head: true + + moe-a2a-backend: deepep + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + mem-fraction-static: 0.9 + max-running-requests: 1024 + cuda-graph-max-bs: 1024 + chunked-prefill-size: 32768 + stream-interval: 60 + + decode: + served-model-name: deepseek-ai/DeepSeek-V4-Pro + model-path: "/model/" + trust-remote-code: true + tool-call-parser: deepseekv4 + + disaggregation-mode: decode + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 16 + data-parallel-size: 16 + expert-parallel-size: 16 + + enable-dp-attention: true + enable-dp-lm-head: true + + moe-a2a-backend: deepep + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + speculative-algo: "EAGLE" + speculative-num-steps: 3 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 4 + + mem-fraction-static: 0.9 + max-running-requests: 18432 + cuda-graph-max-bs: 1024 + swa-full-tokens-ratio: 0.15 + context-length: 16384 + stream-interval: 60 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + concurrencies: "8192" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-1p1d-dep8-conc8192-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-1p1d-dep8-conc8192-mtp.yaml new file mode 100644 index 000000000..133beff1c --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-1p1d-dep8-conc8192-mtp.yaml @@ -0,0 +1,161 @@ +name: "dsv4-pro-gb300-disagg-1k1k-mtp-1p1d-dep8-conc8192" + +slurm: + time_limit: "03:00:00" + +sbatch_directives: + cpus-per-task: '144' + mem: '0' + +dynamo: + hash: "34d55a596fb8d3d44daefe425ec1e303131f4d2c" + install: true + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 8 + +model: + path: dsv4-pro + container: "lmsysorg/sglang:nightly-dev-cu13-20260603-83bc7766" + precision: fp4 + +resources: + gpu_type: gb300 + gpus_per_node: 4 + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + decode_nodes: 2 + decode_workers: 1 + gpus_per_decode: 8 + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: '1' + SGLANG_RADIX_DISABLE_REUSE: '1' + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: '1' + SGLANG_DEFAULT_THINKING: '1' + SGLANG_DSV4_REASONING_EFFORT: max + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: '1' + SGLANG_OPT_USE_JIT_NORM: '1' + SGLANG_OPT_USE_JIT_INDEXER_METADATA: '1' + SGLANG_OPT_USE_TOPK_V2: '1' + + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: '1' + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: '1' + SGLANG_OPT_USE_FAST_MASK_EP: '1' + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: '1' + SGLANG_OPT_FIX_HASH_MEGA_MOE: '1' + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: '9216' + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: '1' + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: '1' + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '0' + + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + MC_FORCE_MNNVL: '1' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: '1' + + decode_environment: + PYTHONUNBUFFERED: '1' + SGLANG_RADIX_DISABLE_REUSE: '1' + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: '1' + SGLANG_DEFAULT_THINKING: '1' + SGLANG_DSV4_REASONING_EFFORT: max + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: '1' + SGLANG_OPT_USE_JIT_NORM: '1' + SGLANG_OPT_USE_JIT_INDEXER_METADATA: '1' + SGLANG_OPT_USE_TOPK_V2: '1' + + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: '1' + SGLANG_OPT_USE_FAST_MASK_EP: '1' + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: '1' + SGLANG_OPT_FIX_HASH_MEGA_MOE: '1' + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: '2048' + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: '1' + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: '1' + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '0' + + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + MC_FORCE_MNNVL: '1' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: '1' + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "0" # CAR_V2 is single-node only. + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + + sglang_config: + prefill: + served-model-name: deepseek-ai/DeepSeek-V4-Pro + model-path: "/model/" + trust-remote-code: true + tool-call-parser: deepseekv4 + + disaggregation-mode: prefill + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + + enable-dp-attention: true + enable-dp-lm-head: true + + moe-a2a-backend: deepep + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + mem-fraction-static: 0.9 + max-running-requests: 256 + cuda-graph-max-bs: 256 + chunked-prefill-size: 32768 + stream-interval: 60 + + decode: + served-model-name: deepseek-ai/DeepSeek-V4-Pro + model-path: "/model/" + trust-remote-code: true + tool-call-parser: deepseekv4 + + disaggregation-mode: decode + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 8 + + enable-dp-attention: true + enable-dp-lm-head: true + + moe-a2a-backend: deepep + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + speculative-algo: "EAGLE" + speculative-num-steps: 3 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 4 + + mem-fraction-static: 0.94 + max-running-requests: 1536 + cuda-graph-max-bs: 256 + swa-full-tokens-ratio: 0.15 + context-length: 16384 + stream-interval: 60 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + concurrencies: "8192" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-2p1d-dep16-conc8192-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-2p1d-dep16-conc8192-mtp.yaml new file mode 100644 index 000000000..5176c28f2 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-2p1d-dep16-conc8192-mtp.yaml @@ -0,0 +1,160 @@ +name: "dsv4-pro-gb300-disagg-1k1k-mtp-2p1d-dep16-conc8192" + +slurm: + time_limit: "03:00:00" + +sbatch_directives: + cpus-per-task: '144' + mem: '0' + +dynamo: + hash: "34d55a596fb8d3d44daefe425ec1e303131f4d2c" + install: true + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 8 + +model: + path: dsv4-pro + container: "lmsysorg/sglang:nightly-dev-cu13-20260603-83bc7766" + precision: fp4 + +resources: + gpu_type: gb300 + gpus_per_node: 4 + prefill_nodes: 2 + prefill_workers: 2 + gpus_per_prefill: 4 + decode_nodes: 4 + decode_workers: 1 + gpus_per_decode: 16 + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: '1' + SGLANG_RADIX_DISABLE_REUSE: '1' + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: '1' + SGLANG_DEFAULT_THINKING: '1' + SGLANG_DSV4_REASONING_EFFORT: max + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: '1' + SGLANG_OPT_USE_JIT_NORM: '1' + SGLANG_OPT_USE_JIT_INDEXER_METADATA: '1' + SGLANG_OPT_USE_TOPK_V2: '1' + + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: '1' + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: '1' + SGLANG_OPT_USE_FAST_MASK_EP: '1' + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: '1' + SGLANG_OPT_FIX_HASH_MEGA_MOE: '1' + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: '9216' + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: '1' + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: '1' + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '0' + + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + MC_FORCE_MNNVL: '1' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: '1' + + decode_environment: + PYTHONUNBUFFERED: '1' + SGLANG_RADIX_DISABLE_REUSE: '1' + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: '1' + SGLANG_DEFAULT_THINKING: '1' + SGLANG_DSV4_REASONING_EFFORT: max + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: '1' + SGLANG_OPT_USE_JIT_NORM: '1' + SGLANG_OPT_USE_JIT_INDEXER_METADATA: '1' + SGLANG_OPT_USE_TOPK_V2: '1' + + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: '1' + SGLANG_OPT_USE_FAST_MASK_EP: '1' + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: '1' + SGLANG_OPT_FIX_HASH_MEGA_MOE: '1' + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: '4096' + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: '1' + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: '1' + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '0' + + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + MC_FORCE_MNNVL: '1' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: '1' + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "0" # CAR_V2 is single-node only. + + sglang_config: + prefill: + served-model-name: deepseek-ai/DeepSeek-V4-Pro + model-path: "/model/" + trust-remote-code: true + tool-call-parser: deepseekv4 + + disaggregation-mode: prefill + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + + enable-dp-attention: true + enable-dp-lm-head: true + + moe-a2a-backend: deepep + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + mem-fraction-static: 0.9 + max-running-requests: 1024 + cuda-graph-max-bs: 1024 + chunked-prefill-size: 32768 + stream-interval: 60 + + decode: + served-model-name: deepseek-ai/DeepSeek-V4-Pro + model-path: "/model/" + trust-remote-code: true + tool-call-parser: deepseekv4 + + disaggregation-mode: decode + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 16 + data-parallel-size: 16 + expert-parallel-size: 16 + + enable-dp-attention: true + enable-dp-lm-head: true + + moe-a2a-backend: deepep + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + speculative-algo: "EAGLE" + speculative-num-steps: 3 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 4 + + mem-fraction-static: 0.9 + max-running-requests: 18432 + cuda-graph-max-bs: 1024 + swa-full-tokens-ratio: 0.15 + context-length: 16384 + stream-interval: 60 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + concurrencies: "8192" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-dep4-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-dep4-mtp.yaml new file mode 100644 index 000000000..08aad8c43 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-dep4-mtp.yaml @@ -0,0 +1,148 @@ +base: + name: "dsv4-pro-gb300-disagg-1k1k-mtp-low-latency-dep4" + + frontend: + type: sglang + enable_multiple_frontends: false + args: + policy: "cache_aware" + + model: + path: "dsv4-pro" + container: "lmsysorg/sglang:nightly-dev-cu13-20260510-2473659e" + precision: "mxfp4" + + resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 1 + decode_workers: 1 + + backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "9216" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" + + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2 + # is single-node only and corrupts results in 2-node decode setups. + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + disable-radix-cache: true + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + + enable-dp-attention: true + enable-dp-lm-head: true + + moe-a2a-backend: "deepep" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + mem-fraction-static: 0.90 + max-running-requests: 512 + cuda-graph-max-bs: 512 + chunked-prefill-size: 4096 + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + disable-radix-cache: true + + disaggregation-mode: "decode" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 1 + expert-parallel-size: 1 + + moe-runner-backend: "flashinfer_mxfp4" + disable-flashinfer-autotune: true + + speculative-algo: "EAGLE" + speculative-num-steps: 3 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 4 + + mem-fraction-static: 0.9 + max-running-requests: 512 + cuda-graph-max-bs: 512 + swa-full-tokens-ratio: 0.1 + context-length: 4096 + + benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + concurrencies: "1x2x4x8x16x32x64" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" + + +zip_override_1k1k_lowlat: + resources: + decode_nodes: [1, 2, 4, 6] + decode_workers: [1, 2, 4, 6] + backend: + sglang_config: + decode: + max-running-requests: [64, 64, 64, 64] + cuda-graph-max-bs: [64, 64, 64, 64] + benchmark: + concurrencies: "1x2x4x8x16x32x64" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-tp4-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-tp4-mtp.yaml new file mode 100644 index 000000000..e7f5f0f34 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-tp4-mtp.yaml @@ -0,0 +1,134 @@ +base: + name: "dsv4-pro-gb300-disagg-1k1k-mtp-low-latency-tp4" + + frontend: + type: sglang + enable_multiple_frontends: false + args: + policy: "cache_aware" + + model: + path: "dsv4-pro" + container: "lmsysorg/sglang:nightly-dev-cu13-20260510-2473659e" + precision: "mxfp4" + + resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 1 + decode_workers: 1 + + backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2 + # is single-node only and corrupts results in 2-node decode setups. + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + disable-radix-cache: true + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 1 + expert-parallel-size: 1 + + moe-runner-backend: "flashinfer_mxfp4" + disable-flashinfer-autotune: true + + mem-fraction-static: 0.90 + max-running-requests: 8 + cuda-graph-max-bs: 8 + chunked-prefill-size: 4096 + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + disable-radix-cache: true + + disaggregation-mode: "decode" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 1 + expert-parallel-size: 1 + + moe-runner-backend: "flashinfer_mxfp4" + disable-flashinfer-autotune: true + + speculative-algo: "EAGLE" + speculative-num-steps: 3 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 4 + + mem-fraction-static: 0.9 + max-running-requests: 8 + cuda-graph-max-bs: 8 + swa-full-tokens-ratio: 0.1 + context-length: 4096 + + benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + concurrencies: "1x2" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" + + +zip_override_1k1k_lowlat: + resources: + decode_nodes: [1, 2, 4, 6] + decode_workers: [1, 2, 4, 6] + backend: + sglang_config: + decode: + max-running-requests: [8, 8, 8, 8] + cuda-graph-max-bs: [8, 8, 8, 8] + benchmark: + concurrencies: "1x2" diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 5622173f1..80cbcbb21 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3531,3 +3531,14 @@ - "The Rust frontend replaces only the Python serving/API layer (HTTP, tokenization, scheduling glue, detokenization) and spawns the same Python EngineCore, so GPU kernels/attention/MoE GEMM/KV cache are untouched" - "A/B sweep (28 single-node points, 1k1k + 8k1k, TP 1/2/4) vs the Python-frontend baseline (run 26696260751): throughput Pareto-neutral (peak tok/s/GPU within <1.5%, frontiers coincident) and TPOT flat (+-0.5%); TTFT improves ~8% at 1k1k and ~22% at 8k1k (every point), the expected signature of lower frontend CPU latency before first token, scaling with input length" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1634 + +- config-keys: + - dsv4-fp4-gb300-dynamo-sglang-mtp-1k1k + description: + - "Add DeepSeek-V4-Pro FP4 GB300 disaggregated SGLang MTP 1k/1k coverage using lmsysorg/sglang:nightly-dev-cu13-20260603-83bc7766" + - "New top-level config (sibling of dsv4-fp4-gb300-dynamo-sglang-mtp which stays on the older 8k/1k image nightly-dev-20260527-14f81a67); separate entry is required because the launcher builds one squashfs per top-level image and the 1k/1k recipes pin a newer container string" + - "Wires 5 disagg recipes adapted from NVIDIA/srt-slurm PR #177 (recipes/dsv4-pro/sglang/gb300-fp4/1k1k/disagg/mtp/), staged locally under benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/: 3 high-throughput conc=8192 recipes (1p1d-dep8, 1p1d-dep16, 2p1d-dep16) plus 2 low-latency templates (disagg-low-latency-dep4-mtp matrix-point at conc=64, disagg-low-latency-tp4-mtp matrix-point at conc=2)" + - "Decode side runs EAGLE (num-steps=3, eagle-topk=1, num-draft-tokens=4) on all 5 recipes" + - "The 2 low-latency recipe yamls retain their upstream container reference (lmsysorg/sglang:nightly-dev-cu13-20260510-2473659e); that image is not currently on Docker Hub, so the squash-import step would 404 without a launcher-side container alias or an upstream image push" + - "Matrix tracks one representative point per low-latency template; the recipe itself uses srt-slurm's base+zip_override_1k1k_lowlat syntax to internally generate 4 decode_nodes variants (1/2/4/6) per template" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX From 984a91ed5be78f1baf3f806f84d58f51e9437f43 Mon Sep 17 00:00:00 2001 From: hshrivastava-droid Date: Tue, 9 Jun 2026 16:29:57 -0700 Subject: [PATCH 2/6] update configs --- .github/configs/nvidia-master.yaml | 44 +++++- .../disagg-low-latency-1p1d-tp4-tp4-mtp.yaml | 120 ++++++++++++++++ .../disagg-low-latency-1p2d-tp4-tp4-mtp.yaml | 120 ++++++++++++++++ .../disagg-low-latency-1p4d-tp4-tp4-mtp.yaml | 120 ++++++++++++++++ .../disagg-low-latency-1p6d-tp4-tp4-mtp.yaml | 120 ++++++++++++++++ .../1k1k/disagg-low-latency-tp4-mtp.yaml | 134 ------------------ perf-changelog.yaml | 9 +- 7 files changed, 527 insertions(+), 140 deletions(-) create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p1d-tp4-tp4-mtp.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p2d-tp4-tp4-mtp.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p4d-tp4-tp4-mtp.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p6d-tp4-tp4-mtp.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-tp4-mtp.yaml diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 54e875833..0a71d9d0e 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -9295,12 +9295,54 @@ dsv4-fp4-gb300-dynamo-sglang-mtp-1k1k: ep: 1 dp-attn: false additional-settings: - - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-tp4-mtp.yaml" + - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p1d-tp4-tp4-mtp.yaml" decode: num-worker: 1 tp: 4 ep: 1 dp-attn: false + - spec-decoding: "mtp" + conc-list: [2] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p2d-tp4-tp4-mtp.yaml" + decode: + num-worker: 2 + tp: 4 + ep: 1 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [2] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p4d-tp4-tp4-mtp.yaml" + decode: + num-worker: 4 + tp: 4 + ep: 1 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [2] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p6d-tp4-tp4-mtp.yaml" + decode: + num-worker: 6 + tp: 4 + ep: 1 + dp-attn: false kimik2.5-int4-h100-vllm: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p1d-tp4-tp4-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p1d-tp4-tp4-mtp.yaml new file mode 100644 index 000000000..97935ebec --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p1d-tp4-tp4-mtp.yaml @@ -0,0 +1,120 @@ +name: "dsv4-pro-gb300-disagg-1k1k-mtp-low-latency-1p1d-tp4-tp4" + +frontend: + type: sglang + enable_multiple_frontends: false + args: + policy: "cache_aware" + +model: + path: "dsv4-pro" + container: "lmsysorg/sglang:nightly-dev-cu13-20260510-2473659e" + precision: "mxfp4" + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 1 + decode_workers: 1 + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2 + # is single-node only and corrupts results in 2-node decode setups. + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + disable-radix-cache: true + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 1 + expert-parallel-size: 1 + + moe-runner-backend: "flashinfer_mxfp4" + disable-flashinfer-autotune: true + + mem-fraction-static: 0.90 + max-running-requests: 8 + cuda-graph-max-bs: 8 + chunked-prefill-size: 4096 + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + disable-radix-cache: true + + disaggregation-mode: "decode" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 1 + expert-parallel-size: 1 + + moe-runner-backend: "flashinfer_mxfp4" + disable-flashinfer-autotune: true + + speculative-algo: "EAGLE" + speculative-num-steps: 3 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 4 + + mem-fraction-static: 0.9 + max-running-requests: 8 + cuda-graph-max-bs: 8 + swa-full-tokens-ratio: 0.1 + context-length: 4096 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + concurrencies: "1x2" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p2d-tp4-tp4-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p2d-tp4-tp4-mtp.yaml new file mode 100644 index 000000000..454c5cd09 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p2d-tp4-tp4-mtp.yaml @@ -0,0 +1,120 @@ +name: "dsv4-pro-gb300-disagg-1k1k-mtp-low-latency-1p2d-tp4-tp4" + +frontend: + type: sglang + enable_multiple_frontends: false + args: + policy: "cache_aware" + +model: + path: "dsv4-pro" + container: "lmsysorg/sglang:nightly-dev-cu13-20260510-2473659e" + precision: "mxfp4" + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 2 + decode_workers: 2 + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2 + # is single-node only and corrupts results in 2-node decode setups. + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + disable-radix-cache: true + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 1 + expert-parallel-size: 1 + + moe-runner-backend: "flashinfer_mxfp4" + disable-flashinfer-autotune: true + + mem-fraction-static: 0.90 + max-running-requests: 8 + cuda-graph-max-bs: 8 + chunked-prefill-size: 4096 + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + disable-radix-cache: true + + disaggregation-mode: "decode" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 1 + expert-parallel-size: 1 + + moe-runner-backend: "flashinfer_mxfp4" + disable-flashinfer-autotune: true + + speculative-algo: "EAGLE" + speculative-num-steps: 3 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 4 + + mem-fraction-static: 0.9 + max-running-requests: 8 + cuda-graph-max-bs: 8 + swa-full-tokens-ratio: 0.1 + context-length: 4096 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + concurrencies: "1x2" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p4d-tp4-tp4-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p4d-tp4-tp4-mtp.yaml new file mode 100644 index 000000000..6b77bf113 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p4d-tp4-tp4-mtp.yaml @@ -0,0 +1,120 @@ +name: "dsv4-pro-gb300-disagg-1k1k-mtp-low-latency-1p4d-tp4-tp4" + +frontend: + type: sglang + enable_multiple_frontends: false + args: + policy: "cache_aware" + +model: + path: "dsv4-pro" + container: "lmsysorg/sglang:nightly-dev-cu13-20260510-2473659e" + precision: "mxfp4" + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 4 + decode_workers: 4 + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2 + # is single-node only and corrupts results in 2-node decode setups. + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + disable-radix-cache: true + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 1 + expert-parallel-size: 1 + + moe-runner-backend: "flashinfer_mxfp4" + disable-flashinfer-autotune: true + + mem-fraction-static: 0.90 + max-running-requests: 8 + cuda-graph-max-bs: 8 + chunked-prefill-size: 4096 + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + disable-radix-cache: true + + disaggregation-mode: "decode" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 1 + expert-parallel-size: 1 + + moe-runner-backend: "flashinfer_mxfp4" + disable-flashinfer-autotune: true + + speculative-algo: "EAGLE" + speculative-num-steps: 3 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 4 + + mem-fraction-static: 0.9 + max-running-requests: 8 + cuda-graph-max-bs: 8 + swa-full-tokens-ratio: 0.1 + context-length: 4096 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + concurrencies: "1x2" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p6d-tp4-tp4-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p6d-tp4-tp4-mtp.yaml new file mode 100644 index 000000000..cb657a956 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p6d-tp4-tp4-mtp.yaml @@ -0,0 +1,120 @@ +name: "dsv4-pro-gb300-disagg-1k1k-mtp-low-latency-1p6d-tp4-tp4" + +frontend: + type: sglang + enable_multiple_frontends: false + args: + policy: "cache_aware" + +model: + path: "dsv4-pro" + container: "lmsysorg/sglang:nightly-dev-cu13-20260510-2473659e" + precision: "mxfp4" + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 6 + decode_workers: 6 + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2 + # is single-node only and corrupts results in 2-node decode setups. + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + disable-radix-cache: true + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 1 + expert-parallel-size: 1 + + moe-runner-backend: "flashinfer_mxfp4" + disable-flashinfer-autotune: true + + mem-fraction-static: 0.90 + max-running-requests: 8 + cuda-graph-max-bs: 8 + chunked-prefill-size: 4096 + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + disable-radix-cache: true + + disaggregation-mode: "decode" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 1 + expert-parallel-size: 1 + + moe-runner-backend: "flashinfer_mxfp4" + disable-flashinfer-autotune: true + + speculative-algo: "EAGLE" + speculative-num-steps: 3 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 4 + + mem-fraction-static: 0.9 + max-running-requests: 8 + cuda-graph-max-bs: 8 + swa-full-tokens-ratio: 0.1 + context-length: 4096 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + concurrencies: "1x2" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-tp4-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-tp4-mtp.yaml deleted file mode 100644 index e7f5f0f34..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-tp4-mtp.yaml +++ /dev/null @@ -1,134 +0,0 @@ -base: - name: "dsv4-pro-gb300-disagg-1k1k-mtp-low-latency-tp4" - - frontend: - type: sglang - enable_multiple_frontends: false - args: - policy: "cache_aware" - - model: - path: "dsv4-pro" - container: "lmsysorg/sglang:nightly-dev-cu13-20260510-2473659e" - precision: "mxfp4" - - resources: - gpu_type: "gb300" - gpus_per_node: 4 - prefill_nodes: 1 - prefill_workers: 1 - decode_nodes: 1 - decode_workers: 1 - - backend: - type: sglang - - prefill_environment: - PYTHONUNBUFFERED: "1" - SGLANG_RADIX_DISABLE_REUSE: "1" - SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" - SGLANG_DEFAULT_THINKING: "1" - SGLANG_DSV4_REASONING_EFFORT: "max" - SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" - SGLANG_OPT_USE_JIT_NORM: "1" - SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" - SGLANG_OPT_USE_TOPK_V2: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - MC_FORCE_MNNVL: "1" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" - - decode_environment: - PYTHONUNBUFFERED: "1" - SGLANG_RADIX_DISABLE_REUSE: "1" - SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" - SGLANG_DEFAULT_THINKING: "1" - SGLANG_DSV4_REASONING_EFFORT: "max" - SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" - SGLANG_OPT_USE_JIT_NORM: "1" - SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" - SGLANG_OPT_USE_TOPK_V2: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - MC_FORCE_MNNVL: "1" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" - # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2 - # is single-node only and corrupts results in 2-node decode setups. - - sglang_config: - prefill: - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - model-path: "/model/" - trust-remote-code: true - disable-radix-cache: true - - disaggregation-mode: "prefill" - disaggregation-transfer-backend: mooncake - - tensor-parallel-size: 4 - data-parallel-size: 1 - expert-parallel-size: 1 - - moe-runner-backend: "flashinfer_mxfp4" - disable-flashinfer-autotune: true - - mem-fraction-static: 0.90 - max-running-requests: 8 - cuda-graph-max-bs: 8 - chunked-prefill-size: 4096 - - decode: - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - model-path: "/model/" - trust-remote-code: true - disable-radix-cache: true - - disaggregation-mode: "decode" - disaggregation-transfer-backend: mooncake - - tensor-parallel-size: 4 - data-parallel-size: 1 - expert-parallel-size: 1 - - moe-runner-backend: "flashinfer_mxfp4" - disable-flashinfer-autotune: true - - speculative-algo: "EAGLE" - speculative-num-steps: 3 - speculative-eagle-topk: 1 - speculative-num-draft-tokens: 4 - - mem-fraction-static: 0.9 - max-running-requests: 8 - cuda-graph-max-bs: 8 - swa-full-tokens-ratio: 0.1 - context-length: 4096 - - benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - random_range_ratio: 0.8 - concurrencies: "1x2" - req_rate: "inf" - use_chat_template: true - custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" - - -zip_override_1k1k_lowlat: - resources: - decode_nodes: [1, 2, 4, 6] - decode_workers: [1, 2, 4, 6] - backend: - sglang_config: - decode: - max-running-requests: [8, 8, 8, 8] - cuda-graph-max-bs: [8, 8, 8, 8] - benchmark: - concurrencies: "1x2" diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 80cbcbb21..4a7993915 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3537,8 +3537,7 @@ description: - "Add DeepSeek-V4-Pro FP4 GB300 disaggregated SGLang MTP 1k/1k coverage using lmsysorg/sglang:nightly-dev-cu13-20260603-83bc7766" - "New top-level config (sibling of dsv4-fp4-gb300-dynamo-sglang-mtp which stays on the older 8k/1k image nightly-dev-20260527-14f81a67); separate entry is required because the launcher builds one squashfs per top-level image and the 1k/1k recipes pin a newer container string" - - "Wires 5 disagg recipes adapted from NVIDIA/srt-slurm PR #177 (recipes/dsv4-pro/sglang/gb300-fp4/1k1k/disagg/mtp/), staged locally under benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/: 3 high-throughput conc=8192 recipes (1p1d-dep8, 1p1d-dep16, 2p1d-dep16) plus 2 low-latency templates (disagg-low-latency-dep4-mtp matrix-point at conc=64, disagg-low-latency-tp4-mtp matrix-point at conc=2)" - - "Decode side runs EAGLE (num-steps=3, eagle-topk=1, num-draft-tokens=4) on all 5 recipes" - - "The 2 low-latency recipe yamls retain their upstream container reference (lmsysorg/sglang:nightly-dev-cu13-20260510-2473659e); that image is not currently on Docker Hub, so the squash-import step would 404 without a launcher-side container alias or an upstream image push" - - "Matrix tracks one representative point per low-latency template; the recipe itself uses srt-slurm's base+zip_override_1k1k_lowlat syntax to internally generate 4 decode_nodes variants (1/2/4/6) per template" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX + - "Wires 8 disagg recipes adapted from NVIDIA/srt-slurm PR #177 (recipes/dsv4-pro/sglang/gb300-fp4/1k1k/disagg/mtp/), staged locally under benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/: 3 high-throughput conc=8192 recipes (1p1d-dep8, 1p1d-dep16, 2p1d-dep16), 1 low-latency dep4 template (disagg-low-latency-dep4-mtp at conc=64), and 4 flat low-latency tp4 recipes (1p1d/1p2d/1p4d/1p6d at conc=2)" + - "Flat tp4 split (1p1d/1p2d/1p4d/1p6d-tp4-tp4) replaces the upstream disagg-low-latency-tp4-mtp.yaml template (base + zip_override_1k1k_lowlat with decode_nodes [1,2,4,6]); the templated form silently broke under the gb300-cw launcher because its top-level name-injection (`sed -i \"s/^name:.*/name: ...\"`) doesn't match base.name's 2-space indent, prepending a duplicate top-level name: key. Single-config files match the 8k1k convention (one recipe per decode-worker count) and side-step the issue" + - "Decode side runs EAGLE (num-steps=3, eagle-topk=1, num-draft-tokens=4) on all 8 recipes; recipe container strings are kept upstream-faithful (high-tput recipes pin nightly-dev-cu13-20260603-83bc7766, low-latency recipes pin nightly-dev-cu13-20260510-2473659e)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1697 From 7c91283fcd52f5c84fae78c211382dfa1f346f89 Mon Sep 17 00:00:00 2001 From: hshrivastava-droid Date: Tue, 9 Jun 2026 21:36:48 -0700 Subject: [PATCH 3/6] split configs --- .github/configs/nvidia-master.yaml | 44 +++++- .../disagg-low-latency-1p1d-dep4-tp4-mtp.yaml | 134 ++++++++++++++++ .../disagg-low-latency-1p2d-dep4-tp4-mtp.yaml | 134 ++++++++++++++++ .../disagg-low-latency-1p4d-dep4-tp4-mtp.yaml | 134 ++++++++++++++++ .../disagg-low-latency-1p6d-dep4-tp4-mtp.yaml | 134 ++++++++++++++++ .../1k1k/disagg-low-latency-dep4-mtp.yaml | 148 ------------------ perf-changelog.yaml | 6 +- 7 files changed, 582 insertions(+), 152 deletions(-) create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p1d-dep4-tp4-mtp.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p2d-dep4-tp4-mtp.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p4d-dep4-tp4-mtp.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p6d-dep4-tp4-mtp.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-dep4-mtp.yaml diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 0a71d9d0e..a6d42207c 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -9281,12 +9281,54 @@ dsv4-fp4-gb300-dynamo-sglang-mtp-1k1k: ep: 4 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-dep4-mtp.yaml" + - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p1d-dep4-tp4-mtp.yaml" decode: num-worker: 1 tp: 4 ep: 1 dp-attn: false + - spec-decoding: "mtp" + conc-list: [64] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p2d-dep4-tp4-mtp.yaml" + decode: + num-worker: 2 + tp: 4 + ep: 1 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [64] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p4d-dep4-tp4-mtp.yaml" + decode: + num-worker: 4 + tp: 4 + ep: 1 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [64] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p6d-dep4-tp4-mtp.yaml" + decode: + num-worker: 6 + tp: 4 + ep: 1 + dp-attn: false - spec-decoding: "mtp" conc-list: [2] prefill: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p1d-dep4-tp4-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p1d-dep4-tp4-mtp.yaml new file mode 100644 index 000000000..92a7d98a5 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p1d-dep4-tp4-mtp.yaml @@ -0,0 +1,134 @@ +name: "dsv4-pro-gb300-disagg-1k1k-mtp-low-latency-1p1d-dep4-tp4" + +frontend: + type: sglang + enable_multiple_frontends: false + args: + policy: "cache_aware" + +model: + path: "dsv4-pro" + container: "lmsysorg/sglang:nightly-dev-cu13-20260510-2473659e" + precision: "mxfp4" + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 1 + decode_workers: 1 + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "9216" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" + + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2 + # is single-node only and corrupts results in 2-node decode setups. + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + disable-radix-cache: true + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + + enable-dp-attention: true + enable-dp-lm-head: true + + moe-a2a-backend: "deepep" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + mem-fraction-static: 0.90 + max-running-requests: 512 + cuda-graph-max-bs: 512 + chunked-prefill-size: 4096 + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + disable-radix-cache: true + + disaggregation-mode: "decode" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 1 + expert-parallel-size: 1 + + moe-runner-backend: "flashinfer_mxfp4" + disable-flashinfer-autotune: true + + speculative-algo: "EAGLE" + speculative-num-steps: 3 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 4 + + mem-fraction-static: 0.9 + max-running-requests: 64 + cuda-graph-max-bs: 64 + swa-full-tokens-ratio: 0.1 + context-length: 4096 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + concurrencies: "1x2x4x8x16x32x64" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p2d-dep4-tp4-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p2d-dep4-tp4-mtp.yaml new file mode 100644 index 000000000..fe154f889 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p2d-dep4-tp4-mtp.yaml @@ -0,0 +1,134 @@ +name: "dsv4-pro-gb300-disagg-1k1k-mtp-low-latency-1p2d-dep4-tp4" + +frontend: + type: sglang + enable_multiple_frontends: false + args: + policy: "cache_aware" + +model: + path: "dsv4-pro" + container: "lmsysorg/sglang:nightly-dev-cu13-20260510-2473659e" + precision: "mxfp4" + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 2 + decode_workers: 2 + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "9216" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" + + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2 + # is single-node only and corrupts results in 2-node decode setups. + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + disable-radix-cache: true + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + + enable-dp-attention: true + enable-dp-lm-head: true + + moe-a2a-backend: "deepep" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + mem-fraction-static: 0.90 + max-running-requests: 512 + cuda-graph-max-bs: 512 + chunked-prefill-size: 4096 + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + disable-radix-cache: true + + disaggregation-mode: "decode" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 1 + expert-parallel-size: 1 + + moe-runner-backend: "flashinfer_mxfp4" + disable-flashinfer-autotune: true + + speculative-algo: "EAGLE" + speculative-num-steps: 3 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 4 + + mem-fraction-static: 0.9 + max-running-requests: 64 + cuda-graph-max-bs: 64 + swa-full-tokens-ratio: 0.1 + context-length: 4096 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + concurrencies: "1x2x4x8x16x32x64" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p4d-dep4-tp4-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p4d-dep4-tp4-mtp.yaml new file mode 100644 index 000000000..148d27707 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p4d-dep4-tp4-mtp.yaml @@ -0,0 +1,134 @@ +name: "dsv4-pro-gb300-disagg-1k1k-mtp-low-latency-1p4d-dep4-tp4" + +frontend: + type: sglang + enable_multiple_frontends: false + args: + policy: "cache_aware" + +model: + path: "dsv4-pro" + container: "lmsysorg/sglang:nightly-dev-cu13-20260510-2473659e" + precision: "mxfp4" + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 4 + decode_workers: 4 + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "9216" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" + + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2 + # is single-node only and corrupts results in 2-node decode setups. + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + disable-radix-cache: true + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + + enable-dp-attention: true + enable-dp-lm-head: true + + moe-a2a-backend: "deepep" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + mem-fraction-static: 0.90 + max-running-requests: 512 + cuda-graph-max-bs: 512 + chunked-prefill-size: 4096 + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + disable-radix-cache: true + + disaggregation-mode: "decode" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 1 + expert-parallel-size: 1 + + moe-runner-backend: "flashinfer_mxfp4" + disable-flashinfer-autotune: true + + speculative-algo: "EAGLE" + speculative-num-steps: 3 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 4 + + mem-fraction-static: 0.9 + max-running-requests: 64 + cuda-graph-max-bs: 64 + swa-full-tokens-ratio: 0.1 + context-length: 4096 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + concurrencies: "1x2x4x8x16x32x64" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p6d-dep4-tp4-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p6d-dep4-tp4-mtp.yaml new file mode 100644 index 000000000..9e7da0458 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-1p6d-dep4-tp4-mtp.yaml @@ -0,0 +1,134 @@ +name: "dsv4-pro-gb300-disagg-1k1k-mtp-low-latency-1p6d-dep4-tp4" + +frontend: + type: sglang + enable_multiple_frontends: false + args: + policy: "cache_aware" + +model: + path: "dsv4-pro" + container: "lmsysorg/sglang:nightly-dev-cu13-20260510-2473659e" + precision: "mxfp4" + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 6 + decode_workers: 6 + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "9216" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" + + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2 + # is single-node only and corrupts results in 2-node decode setups. + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + disable-radix-cache: true + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + + enable-dp-attention: true + enable-dp-lm-head: true + + moe-a2a-backend: "deepep" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + mem-fraction-static: 0.90 + max-running-requests: 512 + cuda-graph-max-bs: 512 + chunked-prefill-size: 4096 + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + disable-radix-cache: true + + disaggregation-mode: "decode" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 1 + expert-parallel-size: 1 + + moe-runner-backend: "flashinfer_mxfp4" + disable-flashinfer-autotune: true + + speculative-algo: "EAGLE" + speculative-num-steps: 3 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 4 + + mem-fraction-static: 0.9 + max-running-requests: 64 + cuda-graph-max-bs: 64 + swa-full-tokens-ratio: 0.1 + context-length: 4096 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + concurrencies: "1x2x4x8x16x32x64" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-dep4-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-dep4-mtp.yaml deleted file mode 100644 index 08aad8c43..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-dep4-mtp.yaml +++ /dev/null @@ -1,148 +0,0 @@ -base: - name: "dsv4-pro-gb300-disagg-1k1k-mtp-low-latency-dep4" - - frontend: - type: sglang - enable_multiple_frontends: false - args: - policy: "cache_aware" - - model: - path: "dsv4-pro" - container: "lmsysorg/sglang:nightly-dev-cu13-20260510-2473659e" - precision: "mxfp4" - - resources: - gpu_type: "gb300" - gpus_per_node: 4 - prefill_nodes: 1 - prefill_workers: 1 - decode_nodes: 1 - decode_workers: 1 - - backend: - type: sglang - - prefill_environment: - PYTHONUNBUFFERED: "1" - SGLANG_RADIX_DISABLE_REUSE: "1" - SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" - SGLANG_DEFAULT_THINKING: "1" - SGLANG_DSV4_REASONING_EFFORT: "max" - SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" - SGLANG_OPT_USE_JIT_NORM: "1" - SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" - SGLANG_OPT_USE_TOPK_V2: "1" - - SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" - SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1" - SGLANG_OPT_USE_FAST_MASK_EP: "1" - SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" - SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" - SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "9216" - SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" - SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" - - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - MC_FORCE_MNNVL: "1" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" - - decode_environment: - PYTHONUNBUFFERED: "1" - SGLANG_RADIX_DISABLE_REUSE: "1" - SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" - SGLANG_DEFAULT_THINKING: "1" - SGLANG_DSV4_REASONING_EFFORT: "max" - SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" - SGLANG_OPT_USE_JIT_NORM: "1" - SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" - SGLANG_OPT_USE_TOPK_V2: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - MC_FORCE_MNNVL: "1" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" - # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2 - # is single-node only and corrupts results in 2-node decode setups. - - sglang_config: - prefill: - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - model-path: "/model/" - trust-remote-code: true - disable-radix-cache: true - - disaggregation-mode: "prefill" - disaggregation-transfer-backend: mooncake - - tensor-parallel-size: 4 - data-parallel-size: 4 - expert-parallel-size: 4 - - enable-dp-attention: true - enable-dp-lm-head: true - - moe-a2a-backend: "deepep" - deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' - - mem-fraction-static: 0.90 - max-running-requests: 512 - cuda-graph-max-bs: 512 - chunked-prefill-size: 4096 - - decode: - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - model-path: "/model/" - trust-remote-code: true - disable-radix-cache: true - - disaggregation-mode: "decode" - disaggregation-transfer-backend: mooncake - - tensor-parallel-size: 4 - data-parallel-size: 1 - expert-parallel-size: 1 - - moe-runner-backend: "flashinfer_mxfp4" - disable-flashinfer-autotune: true - - speculative-algo: "EAGLE" - speculative-num-steps: 3 - speculative-eagle-topk: 1 - speculative-num-draft-tokens: 4 - - mem-fraction-static: 0.9 - max-running-requests: 512 - cuda-graph-max-bs: 512 - swa-full-tokens-ratio: 0.1 - context-length: 4096 - - benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - random_range_ratio: 0.8 - concurrencies: "1x2x4x8x16x32x64" - req_rate: "inf" - use_chat_template: true - custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" - - -zip_override_1k1k_lowlat: - resources: - decode_nodes: [1, 2, 4, 6] - decode_workers: [1, 2, 4, 6] - backend: - sglang_config: - decode: - max-running-requests: [64, 64, 64, 64] - cuda-graph-max-bs: [64, 64, 64, 64] - benchmark: - concurrencies: "1x2x4x8x16x32x64" diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 4a7993915..0b8ab1cb6 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3537,7 +3537,7 @@ description: - "Add DeepSeek-V4-Pro FP4 GB300 disaggregated SGLang MTP 1k/1k coverage using lmsysorg/sglang:nightly-dev-cu13-20260603-83bc7766" - "New top-level config (sibling of dsv4-fp4-gb300-dynamo-sglang-mtp which stays on the older 8k/1k image nightly-dev-20260527-14f81a67); separate entry is required because the launcher builds one squashfs per top-level image and the 1k/1k recipes pin a newer container string" - - "Wires 8 disagg recipes adapted from NVIDIA/srt-slurm PR #177 (recipes/dsv4-pro/sglang/gb300-fp4/1k1k/disagg/mtp/), staged locally under benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/: 3 high-throughput conc=8192 recipes (1p1d-dep8, 1p1d-dep16, 2p1d-dep16), 1 low-latency dep4 template (disagg-low-latency-dep4-mtp at conc=64), and 4 flat low-latency tp4 recipes (1p1d/1p2d/1p4d/1p6d at conc=2)" - - "Flat tp4 split (1p1d/1p2d/1p4d/1p6d-tp4-tp4) replaces the upstream disagg-low-latency-tp4-mtp.yaml template (base + zip_override_1k1k_lowlat with decode_nodes [1,2,4,6]); the templated form silently broke under the gb300-cw launcher because its top-level name-injection (`sed -i \"s/^name:.*/name: ...\"`) doesn't match base.name's 2-space indent, prepending a duplicate top-level name: key. Single-config files match the 8k1k convention (one recipe per decode-worker count) and side-step the issue" - - "Decode side runs EAGLE (num-steps=3, eagle-topk=1, num-draft-tokens=4) on all 8 recipes; recipe container strings are kept upstream-faithful (high-tput recipes pin nightly-dev-cu13-20260603-83bc7766, low-latency recipes pin nightly-dev-cu13-20260510-2473659e)" + - "Wires 11 disagg recipes adapted from NVIDIA/srt-slurm PR #177 (recipes/dsv4-pro/sglang/gb300-fp4/1k1k/disagg/mtp/), staged locally under benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/: 3 high-throughput conc=8192 recipes (1p1d-dep8, 1p1d-dep16, 2p1d-dep16), 4 flat low-latency dep4 recipes (1p1d/1p2d/1p4d/1p6d-dep4-tp4 at conc=64), and 4 flat low-latency tp4 recipes (1p1d/1p2d/1p4d/1p6d-tp4-tp4 at conc=2)" + - "Flat dep4 and tp4 splits replace the upstream disagg-low-latency-dep4-mtp.yaml and disagg-low-latency-tp4-mtp.yaml templates (each base + zip_override_1k1k_lowlat with decode_nodes [1,2,4,6]). The templated form silently broke under the gb300-cw launcher: srtctl fans the single recipe into 4 Slurm submissions per `srtctl apply`, but launch_gb300-cw.sh:282 captures the job IDs with `grep -oP '✅ Job \\K[0-9]+'` into a scalar JOB_ID, jamming 4 newline-separated IDs into one variable. Downstream uses (LOGS_DIR path interpolation at :293-294, squeue polls at :313-314, error message at :315) all corrupt, producing the observed `ERROR: Job 7079\\n7082 failed before creating log file` and orphaning the real Slurm jobs. Single-config files match the 8k1k convention (one recipe per decode-worker count) and force srtctl to emit exactly 1 job per launcher invocation" + - "Decode side runs EAGLE (num-steps=3, eagle-topk=1, num-draft-tokens=4) on all 11 recipes; recipe container strings are kept upstream-faithful (high-tput recipes pin nightly-dev-cu13-20260603-83bc7766, low-latency recipes pin nightly-dev-cu13-20260510-2473659e)" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1697 From 47460ef649a9df9412b6e381ca109b18244de5ee Mon Sep 17 00:00:00 2001 From: hshrivastava-droid Date: Thu, 11 Jun 2026 10:10:46 -0700 Subject: [PATCH 4/6] update runner --- .github/configs/nvidia-master.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index a6d42207c..f00031cdf 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -9221,7 +9221,7 @@ dsv4-fp4-gb300-dynamo-sglang-mtp-1k1k: image: lmsysorg/sglang:nightly-dev-cu13-20260603-83bc7766 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 - runner: gb300-cw + runner: gb300-nv precision: fp4 framework: dynamo-sglang multinode: true From fd84ba54e7995d04bdedde9dd9e093dc20506198 Mon Sep 17 00:00:00 2001 From: hshrivastava-droid Date: Thu, 11 Jun 2026 10:47:30 -0700 Subject: [PATCH 5/6] Update perf-changelog.yaml --- perf-changelog.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 7f117b866..ba26f8d57 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3589,4 +3589,3 @@ - "Flat dep4 and tp4 splits replace the upstream disagg-low-latency-dep4-mtp.yaml and disagg-low-latency-tp4-mtp.yaml templates (each base + zip_override_1k1k_lowlat with decode_nodes [1,2,4,6]). The templated form silently broke under the gb300-cw launcher: srtctl fans the single recipe into 4 Slurm submissions per `srtctl apply`, but launch_gb300-cw.sh:282 captures the job IDs with `grep -oP '✅ Job \\K[0-9]+'` into a scalar JOB_ID, jamming 4 newline-separated IDs into one variable. Downstream uses (LOGS_DIR path interpolation at :293-294, squeue polls at :313-314, error message at :315) all corrupt, producing the observed `ERROR: Job 7079\\n7082 failed before creating log file` and orphaning the real Slurm jobs. Single-config files match the 8k1k convention (one recipe per decode-worker count) and force srtctl to emit exactly 1 job per launcher invocation" - "Decode side runs EAGLE (num-steps=3, eagle-topk=1, num-draft-tokens=4) on all 11 recipes; recipe container strings are kept upstream-faithful (high-tput recipes pin nightly-dev-cu13-20260603-83bc7766, low-latency recipes pin nightly-dev-cu13-20260510-2473659e)" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1697 - \ No newline at end of file From 37100c4c5bab8cb402e74f467819b116cb06efdf Mon Sep 17 00:00:00 2001 From: Bryan Shan <58582368+Oseltamivir@users.noreply.github.com> Date: Thu, 11 Jun 2026 12:59:10 -0700 Subject: [PATCH 6/6] Update nvidia-master.yaml --- .github/configs/nvidia-master.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index c73ab5428..d1ad86d80 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -9247,7 +9247,7 @@ dsv4-fp4-gb300-dynamo-sglang-mtp-1k1k: image: lmsysorg/sglang:nightly-dev-cu13-20260603-83bc7766 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 - runner: gb300-nv + runner: gb300 precision: fp4 framework: dynamo-sglang multinode: true