From 15ab4c75f6219d1fa3b2b8a1cdf4427a839170c8 Mon Sep 17 00:00:00 2001 From: billishyahao Date: Fri, 12 Jun 2026 05:13:51 +0000 Subject: [PATCH 1/4] [AMD] remove accuracy wrong sweep point from dsr1 fp4 sgl disagg on mi355x --- .github/configs/amd-master.yaml | 268 +------------------------------- 1 file changed, 1 insertion(+), 267 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 8909c0e28..603c82b56 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -2022,7 +2022,7 @@ dsr1-fp4-mi355x-sglang-disagg-8k1k-mtp: # 1P2D TP8 - spec-decoding: "mtp" - conc-list: [ 32, 64 ] + conc-list: [ 32 ] prefill: num-worker: 1 tp: 8 @@ -2077,45 +2077,6 @@ dsr1-fp4-mi355x-sglang-disagg-8k1k-mtp: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=3" - - # 1*DEP8 + 1*DEP8 - - spec-decoding: "mtp" - conc-list: [ 128 ] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=3" - - # 1*DEP8 + 1*DEP8 - - spec-decoding: "mtp" - conc-list: [ 64 ] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=3" - # 2*DEP8 + 1*DEP8 - spec-decoding: "mtp" conc-list: [ 1024, 2048, 4096 ] @@ -2437,233 +2398,6 @@ dsv4-fp4-mi355x-vllm-agentic: - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 10, 12, 16] } - { tp: 4, ep: 4, dp-attn: true, offloading: none, conc-list: [16, 24, 32, 40, 48] } -dsr1-fp4-mi355x-sglang-disagg-mtp: - image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260519 - model: amd/DeepSeek-R1-0528-MXFP4-v2 - model-prefix: dsr1 - runner: mi355x-disagg - precision: fp4 - framework: sglang-disagg - multinode: true - disagg: true - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 
- search-space: - # MTP configurations - # 1P1D TP8 - - spec-decoding: "mtp" - conc-list: [ 1, 2, 4, 8 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=3" - - # 1P2D TP8 - - spec-decoding: "mtp" - conc-list: [ 2, 4, 8, 16, 32 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=3" - - # 1P2D TP8 - - spec-decoding: "mtp" - conc-list: [ 64, 128, 256 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=2" - - # 1P2D TP4 - - spec-decoding: "mtp" - conc-list: [ 64, 128, 256 ] - prefill: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=2" - - # 1*DEP4+ 1*DEP8 - - spec-decoding: "mtp" - conc-list: [ 1024, 2048, 4096 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=1" - - - isl: 8192 - osl: 1024 - search-space: - # MTP configurations - # 1P1D pure TP8 - - spec-decoding: "mtp" - conc-list: [ 1, 2, 4, 8 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=3" - - # 1P2D TP8 - - spec-decoding: "mtp" - conc-list: [ 2, 4, 8, 16, 32 ] 
- prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=3" - - # 1P2D TP8 - - spec-decoding: "mtp" - conc-list: [ 64, 128, 256 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=2" - - # 1*DEP8 + 1*DEP8 - - spec-decoding: "mtp" - conc-list: [ 128, 512 ] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=1" - - # 1*DEP8 + 1*DEP8 - - spec-decoding: "mtp" - conc-list: [ 64, 256 ] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=1" - - # 2*DEP8 + 1*DEP8 - - spec-decoding: "mtp" - conc-list: [ 1024, 2048, 4096 ] - prefill: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "PREFILL_NODES=2" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=1" # DSv4-Pro FP4 on MI355X via SGLang. 
Uses a rocm720 mi35x image built off the From 69dd3d83639c18ad0de4a177dbe6d4e83bf5c5e2 Mon Sep 17 00:00:00 2001 From: billishyahao Date: Fri, 12 Jun 2026 05:16:58 +0000 Subject: [PATCH 2/4] add perf change log --- perf-changelog.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index e54dc7b6e..91b18eeef 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3590,3 +3590,9 @@ - "MI355x DSR1-FP4: Include TP4 configurations for 8k1k" - "Expand the TP sweep (included TP=4) for 8k/1k configuration for conc=4 to 64" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1692 + +- config-keys: + - dsr1-fp4-mi355x-sglang-disagg-8k1k-mtp + description: + - "Remove conc128,64 for dep8 case" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1714 From b37a18dc7f22c50947866025ffd26455a0850d14 Mon Sep 17 00:00:00 2001 From: billishyahao Date: Fri, 12 Jun 2026 05:33:36 +0000 Subject: [PATCH 3/4] fix --- .github/configs/amd-master.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 25543164b..2b63fabf1 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -2022,7 +2022,7 @@ dsr1-fp4-mi355x-sglang-disagg-8k1k-mtp: # 1P2D TP8 - spec-decoding: "mtp" - conc-list: [ 32 ] + conc-list: [ 32, 64 ] prefill: num-worker: 1 tp: 8 From 4b531d91d2dbdba77a9bb26323ecc0da80c3e8ef Mon Sep 17 00:00:00 2001 From: billishyahao Date: Fri, 12 Jun 2026 06:38:42 +0000 Subject: [PATCH 4/4] fix --- benchmarks/multi_node/amd_utils/server_sglang.sh | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/server_sglang.sh b/benchmarks/multi_node/amd_utils/server_sglang.sh index c28ccab41..51394e8a6 100755 --- a/benchmarks/multi_node/amd_utils/server_sglang.sh +++ b/benchmarks/multi_node/amd_utils/server_sglang.sh @@ -193,8 +193,12 @@ if [[ 
"$PREFILL_ENABLE_DP" == "true" ]] && [[ "$PREFILL_ENABLE_EP" == "true" ]]; prefill_max_running_requests=$BENCH_MAX_CONC_VALUE prefill_dp_ranks=$PREFILL_TP_SIZE # MORI_MAX_DISPATCH_TOKENS_PREFILL stays at 8192 (no change) - MORI_MOE_MAX_INPUT_TOKENS_PREFILL=$((MORI_MAX_DISPATCH_TOKENS_PREFILL * prefill_dp_ranks / 2)) - echo "[DP+EP override] Prefill: max-running-requests=$prefill_max_running_requests, MOE_MAX_INPUT=$MORI_MOE_MAX_INPUT_TOKENS_PREFILL" + if [[ "$prefill_max_running_requests" -gt 128 ]]; then + MORI_MOE_MAX_INPUT_TOKENS_PREFILL=$((MORI_MAX_DISPATCH_TOKENS_PREFILL * prefill_dp_ranks / 2)) + echo "[DP+EP override] Prefill: max-running-requests=$prefill_max_running_requests, MOE_MAX_INPUT=$MORI_MOE_MAX_INPUT_TOKENS_PREFILL" + else + unset MORI_MOE_MAX_INPUT_TOKENS_PREFILL + fi fi # Compute DP-dependent decode parameters (3-way: DP > EP-only > no_dp) @@ -214,7 +218,11 @@ if [[ "$DECODE_ENABLE_DP" == "true" ]] && [[ "$DECODE_ENABLE_EP" == "true" ]]; t decode_max_running_requests=$BENCH_MAX_CONC_VALUE decode_dp_ranks=$DECODE_TP_SIZE MORI_MAX_DISPATCH_TOKENS_DECODE=$((BENCH_MAX_CONC_VALUE / decode_dp_ranks)) - MORI_MOE_MAX_INPUT_TOKENS_DECODE=$((MORI_MAX_DISPATCH_TOKENS_DECODE * decode_dp_ranks * 7 / 10)) + if [[ "$decode_max_running_requests" -gt 128 ]]; then + MORI_MOE_MAX_INPUT_TOKENS_DECODE=$((MORI_MAX_DISPATCH_TOKENS_DECODE * decode_dp_ranks * 7 / 10)) + else + unset MORI_MOE_MAX_INPUT_TOKENS_DECODE + fi # Update derived variable SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD=$((MORI_MAX_DISPATCH_TOKENS_DECODE * 2)) export SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD