From fcd0be4f30e0575ee91bf573d907a5be85e0673c Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Tue, 9 Jun 2026 10:30:17 -0700 Subject: [PATCH 1/4] dsr1 disagg 8k1k mtp: bump image to nightly 20260609, narrow sweep to conc-64 Bump dsr1-fp4-mi355x-sglang-disagg-8k1k-mtp to SGLang ROCm nightly v0.5.12.post1-rocm720-mi35x-20260609 and collapse the 8k1k MTP search space to the single conc-64 DEP8 + 1xDEP8 (MTP3) point so max(CONC_LIST)=64 -> the decode server sizes the MoRI per-rank dispatch buffer at 64/8*(MTP+1)=32 (<256), the regime that silently corrupted output (gsm8k=0) on -20260529. Validates the upstream fix (sgl-project/sglang#27194, ROCm/mori#356) reported in this nightly. Harness/env-var settings left unchanged so the result is an honest test. --- .github/configs/amd-master.yaml | 137 +------------------------------- 1 file changed, 1 insertion(+), 136 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 78fdffa9a..75c52a6b3 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1967,7 +1967,7 @@ dsr1-fp4-mi355x-sglang-disagg-1k1k-mtp: dsr1-fp4-mi355x-sglang-disagg-8k1k-mtp: - image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260529 + image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260609 model: amd/DeepSeek-R1-0528-MXFP4-v2 model-prefix: dsr1 runner: mi355x-disagg @@ -1980,122 +1980,6 @@ dsr1-fp4-mi355x-sglang-disagg-8k1k-mtp: - isl: 8192 osl: 1024 search-space: - # MTP configurations - # 1P1D pure TP8 - - spec-decoding: "mtp" - conc-list: [ 1, 2, 4, 8 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=3" - - # 1P2D TP8 - - spec-decoding: "mtp" - conc-list: [ 2, 4, 8, 16, 32 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=3" - - # 1P2D TP8 - - spec-decoding: "mtp" - conc-list: [ 32, 64 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=3" - - # 1*DEP8 + 1*DEP8 - - spec-decoding: "mtp" - conc-list: [ 640, 512 ] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=3" - - # 1*DEP8 + 1*DEP8 - - spec-decoding: "mtp" - conc-list: [ 256 ] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=3" - - - # 1*DEP8 + 1*DEP8 - - spec-decoding: "mtp" - conc-list: [ 128 ] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=3" - # 1*DEP8 + 1*DEP8 - spec-decoding: "mtp" conc-list: [ 64 ] @@ -2115,25 +1999,6 @@ dsr1-fp4-mi355x-sglang-disagg-8k1k-mtp: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=3" - # 2*DEP8 + 1*DEP8 - - spec-decoding: "mtp" - conc-list: [ 1024, 2048, 4096 ] - prefill: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "PREFILL_NODES=2" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=1" - # DSv4-Pro FP4 on MI355X via SGLang. Uses a rocm720 mi35x image built off the # amd/deepseek_v4 branch in sgl-project/sglang; the SHA is encoded in the From e316738b0fcb7f449481937b59b00223ffd4712e Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Tue, 9 Jun 2026 10:32:55 -0700 Subject: [PATCH 2/4] perf-changelog: record dsr1 8k1k mtp nightly 20260609 + conc-64 validation entry --- perf-changelog.yaml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 5622173f1..2c9619f5c 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3531,3 +3531,11 @@ - "The Rust frontend replaces only the Python serving/API layer (HTTP, tokenization, scheduling glue, detokenization) and spawns the same Python EngineCore, so GPU kernels/attention/MoE GEMM/KV cache are untouched" - "A/B sweep (28 single-node points, 1k1k + 8k1k, TP 1/2/4) vs the Python-frontend baseline (run 26696260751): throughput Pareto-neutral (peak tok/s/GPU within <1.5%, frontiers coincident) and TPOT flat (+-0.5%); TTFT improves ~8% at 1k1k and ~22% at 8k1k (every point), the expected signature of lower frontend CPU latency before first token, scaling with input length" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1634 + +- config-keys: + - dsr1-fp4-mi355x-sglang-disagg-8k1k-mtp + description: + - "Bump SGLang ROCm image to nightly v0.5.12.post1-rocm720-mi35x-20260609, reported to carry the upstream MoRI EP dispatch-buffer fix (sgl-project/sglang#27194, ROCm/mori#356)" + - "Narrow the 8k1k MTP search space to the single conc-64 DEP8 + 1xDEP8 (MTP3) point. At conc-64/TP8/MTP3 the per-rank dispatch buffer num_max_dispatch_tokens_per_rank = max(CONC_LIST)/TP*(MTP+1) = 64/8*4 = 32 (<256), the regime that silently corrupted decode output on -20260529 (decodes fine, acceptance length stays high, but gsm8k=0)" + - "Harness/env settings left at main values (no env clamp, no in-place patcher) so a green conc-64 gsm8k validates the nightly itself fixes the kernel; pre-fix MI355X reference: dispatch=32->0.00, 64->0.00, >=256->0.94. Search space narrowed for this validation only -- restore the full sweep once confirmed" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1696 From ca5ab99226ccd5e65046b6f87dacfe996f8cad82 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Tue, 9 Jun 2026 10:45:05 -0700 Subject: [PATCH 3/4] dsr1 8k1k mtp: per-conc DEP8 eval ladder (64/32/16/8) for dispatch-bug validation Expand the conc-64 point into separate DEP8+MTP3 entries for conc 64, 32, 16, 8 so each launches its own server and exercises a distinct sub-256 dispatch size (32/16/8/4). conc<=4 omitted (floor(conc/8)*4=0). To get a gsm8k eval at every point (the harness otherwise evals only the highest-conc entry per topology group, and ignores conc<16): - mark_eval_entries: eval every eligible multinode entry per group, each at its own concurrency, instead of just max-conc. - MIN_EVAL_CONC 16 -> 8 so conc-8 (dispatch=4) is eval-eligible. Both are validation-only; revert with the full search-space restore. Verified locally: generator emits 4 eval entries (conc 64/32/16/8, each run-eval=true, eval-conc = own conc) and 4 benchmark entries. --- .github/configs/amd-master.yaml | 60 +++++++++++++++++++- perf-changelog.yaml | 5 +- utils/matrix_logic/generate_sweep_configs.py | 20 +++++-- 3 files changed, 76 insertions(+), 9 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 75c52a6b3..e783a222f 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1980,7 +1980,11 @@ dsr1-fp4-mi355x-sglang-disagg-8k1k-mtp: - isl: 8192 osl: 1024 search-space: - # 1*DEP8 + 1*DEP8 + # Per-concurrency DEP8 + 1*DEP8 (MTP3) ladder to validate the nightly's + # MoRI dispatch-buffer fix across sub-256 dispatch sizes. Each conc is its + # own entry (own server launch) so dispatch = max(conc-list)/TP*(MTP+1) + # varies per point: conc 64->32, 32->16, 16->8, 8->4 (all <256, the bug + # regime). conc<=4 omitted: floor(conc/8)*4 = 0 under main's formula. - spec-decoding: "mtp" conc-list: [ 64 ] prefill: @@ -1999,6 +2003,60 @@ dsr1-fp4-mi355x-sglang-disagg-8k1k-mtp: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=3" + - spec-decoding: "mtp" + conc-list: [ 32 ] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=3" + + - spec-decoding: "mtp" + conc-list: [ 16 ] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=3" + + - spec-decoding: "mtp" + conc-list: [ 8 ] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=3" + # DSv4-Pro FP4 on MI355X via SGLang. Uses a rocm720 mi35x image built off the # amd/deepseek_v4 branch in sgl-project/sglang; the SHA is encoded in the diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 2c9619f5c..b02d7be67 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3536,6 +3536,7 @@ - dsr1-fp4-mi355x-sglang-disagg-8k1k-mtp description: - "Bump SGLang ROCm image to nightly v0.5.12.post1-rocm720-mi35x-20260609, reported to carry the upstream MoRI EP dispatch-buffer fix (sgl-project/sglang#27194, ROCm/mori#356)" - - "Narrow the 8k1k MTP search space to the single conc-64 DEP8 + 1xDEP8 (MTP3) point. At conc-64/TP8/MTP3 the per-rank dispatch buffer num_max_dispatch_tokens_per_rank = max(CONC_LIST)/TP*(MTP+1) = 64/8*4 = 32 (<256), the regime that silently corrupted decode output on -20260529 (decodes fine, acceptance length stays high, but gsm8k=0)" - - "Harness/env settings left at main values (no env clamp, no in-place patcher) so a green conc-64 gsm8k validates the nightly itself fixes the kernel; pre-fix MI355X reference: dispatch=32->0.00, 64->0.00, >=256->0.94. Search space narrowed for this validation only -- restore the full sweep once confirmed" + - "Narrow the 8k1k MTP search space to a per-concurrency DEP8 + 1xDEP8 (MTP3) ladder: conc 64, 32, 16, 8 as separate entries. Each is its own server launch so the per-rank dispatch buffer num_max_dispatch_tokens_per_rank = max(CONC_LIST)/TP*(MTP+1) varies per point: conc 64->32, 32->16, 16->8, 8->4 (all <256, the regime that silently corrupted decode output on -20260529: decodes fine, acceptance length stays high, but gsm8k=0). conc<=4 omitted because floor(conc/8)*4 = 0 under main's formula (degenerate)" + - "Harness/env settings left at main values (no env clamp, no in-place patcher) so a green gsm8k at each point validates the nightly itself fixes the kernel at that dispatch size; pre-fix MI355X reference: dispatch=32->0.00, 64->0.00, >=256->0.94" + - "VALIDATION-ONLY harness changes (revert with the full search space): utils/matrix_logic/generate_sweep_configs.py mark_eval_entries now evals every eligible multinode entry per group (not just the highest-conc one) so all four points get a gsm8k pass, and MIN_EVAL_CONC lowered 16->8 so conc-8 (dispatch=4) is eval-eligible" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1696 diff --git a/utils/matrix_logic/generate_sweep_configs.py b/utils/matrix_logic/generate_sweep_configs.py index 53efcca9f..a3d7fe2fd 100644 --- a/utils/matrix_logic/generate_sweep_configs.py +++ b/utils/matrix_logic/generate_sweep_configs.py @@ -20,7 +20,10 @@ "8k1k": (8192, 1024) } -MIN_EVAL_CONC = 16 +# Lowered 16 -> 8 for the DEP8 MoRI dispatch-buffer validation sweep so the +# conc-8 point (dispatch=4) is eval-eligible. Restore to 16 when the full +# search space is restored. +MIN_EVAL_CONC = 8 # Reverse mapping for exp-name generation seq_len_itos = {v: k for k, v in seq_len_stoi.items()} @@ -116,11 +119,16 @@ def _max_eval_conc(ie): mn_groups[key].append((i, entry)) for entries in mn_groups.values(): - best_idx, best_entry = max(entries, key=_max_eval_conc) - eval_indices.add(best_idx) - # Set eval-conc to median of eligible conc values to avoid OOM during eval - eval_concs = _eligible_eval_concs(best_entry) - mn_eval_conc[best_idx] = eval_concs[len(eval_concs) // 2] + # VALIDATION SWEEP: eval EVERY eligible entry in the group (each at its + # own concurrency), not just the highest-conc one, so the per-dispatch- + # size gsm8k ladder (conc 8/16/32/64 -> dispatch 4/8/16/32) is covered. + # Revert to `best_idx, best_entry = max(entries, key=_max_eval_conc)` + # (single eval per group) when the full search space is restored. + for idx, entry in entries: + eval_indices.add(idx) + # Set eval-conc to median of eligible conc values to avoid OOM during eval + eval_concs = _eligible_eval_concs(entry) + mn_eval_conc[idx] = eval_concs[len(eval_concs) // 2] # Mark the selected entries (skip agentic entries which don't support evals) for i, entry in enumerate(matrix_values): From 38370f43f510ddfba0048d708bb86584e94ea75a Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Tue, 9 Jun 2026 10:49:09 -0700 Subject: [PATCH 4/4] test: make multinode eval-conc floor test relative to MIN_EVAL_CONC The test hardcoded conc [8,16,32]/[8] and expected eval-conc=32, which broke when MIN_EVAL_CONC was lowered 16->8 (eligible median shifted to 16). Rebuild the conc lists from MIN_EVAL_CONC (below/at/above) so the test asserts the floor behavior for any value of the constant -- passes under both 8 and 16. --- utils/matrix_logic/test_generate_sweep_configs.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/utils/matrix_logic/test_generate_sweep_configs.py b/utils/matrix_logic/test_generate_sweep_configs.py index 9bb473896..a4d133361 100644 --- a/utils/matrix_logic/test_generate_sweep_configs.py +++ b/utils/matrix_logic/test_generate_sweep_configs.py @@ -290,6 +290,11 @@ def test_multi_node_skips_groups_with_only_conc_below_min_conc(self): def test_multi_node_eval_conc_uses_only_conc_values_at_or_above_min_conc(self): """Multinode eval-conc should be chosen from conc values >= MIN_EVAL_CONC.""" + # Build conc lists relative to the floor so this stays correct regardless + # of the MIN_EVAL_CONC value: `below` is excluded, eligible = {at, above}. + below = MIN_EVAL_CONC // 2 + at = MIN_EVAL_CONC + above = MIN_EVAL_CONC * 2 matrix_values = [ { "model": "deepseek-ai/DeepSeek-R1-0528", @@ -311,7 +316,7 @@ def test_multi_node_eval_conc_uses_only_conc_values_at_or_above_min_conc(self): "ep": 1, "dp-attn": False, }, - "conc": [8, 16, 32], + "conc": [below, at, above], }, { "model": "deepseek-ai/DeepSeek-R1-0528", @@ -333,14 +338,15 @@ def test_multi_node_eval_conc_uses_only_conc_values_at_or_above_min_conc(self): "ep": 1, "dp-attn": False, }, - "conc": [8], + "conc": [below], }, ] result = mark_eval_entries(matrix_values) assert result[0]["run-eval"] is True - assert result[0]["eval-conc"] == 32 + # eligible = sorted([at, above]); median (index len//2 == 1) == above + assert result[0]["eval-conc"] == above assert result[1]["run-eval"] is False def test_marks_highest_and_median_conc(self):