From fcd0be4f30e0575ee91bf573d907a5be85e0673c Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Tue, 9 Jun 2026 10:30:17 -0700
Subject: [PATCH 1/4] dsr1 disagg 8k1k mtp: bump image to nightly 20260609,
 narrow sweep to conc-64

Bump dsr1-fp4-mi355x-sglang-disagg-8k1k-mtp to SGLang ROCm nightly
v0.5.12.post1-rocm720-mi35x-20260609 and collapse the 8k1k MTP search
space to the single conc-64 DEP8 + 1xDEP8 (MTP3) point. With
max(CONC_LIST)=64, the decode server sizes the MoRI per-rank dispatch
buffer at 64/8*(MTP+1)=32 (<256), which is the regime that silently
corrupted output (gsm8k=0) on -20260529. This validates the upstream fix
(sgl-project/sglang#27194, ROCm/mori#356) reported in this nightly.
Harness/env-var settings are left unchanged so the result is an honest
test.
---
 .github/configs/amd-master.yaml | 137 +-------------------------------
 1 file changed, 1 insertion(+), 136 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 78fdffa9a..75c52a6b3 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -1967,7 +1967,7 @@ dsr1-fp4-mi355x-sglang-disagg-1k1k-mtp:
   
 
 dsr1-fp4-mi355x-sglang-disagg-8k1k-mtp:
-  image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260529
+  image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260609
   model: amd/DeepSeek-R1-0528-MXFP4-v2
   model-prefix: dsr1
   runner: mi355x-disagg
@@ -1980,122 +1980,6 @@ dsr1-fp4-mi355x-sglang-disagg-8k1k-mtp:
     - isl: 8192
       osl: 1024
       search-space:
-      # MTP configurations
-      # 1P1D pure TP8
-      - spec-decoding: "mtp"
-        conc-list: [ 1, 2, 4, 8 ]
-        prefill:
-          num-worker: 1
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "PREFILL_NODES=1"
-        decode:
-          num-worker: 1
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "DECODE_NODES=1"
-          - "DECODE_MTP_SIZE=3"
-
-      # 1P2D TP8
-      - spec-decoding: "mtp"
-        conc-list: [ 2, 4, 8, 16, 32 ]
-        prefill:
-          num-worker: 1
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "PREFILL_NODES=1"
-        decode:
-          num-worker: 2
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "DECODE_NODES=2"
-          - "DECODE_MTP_SIZE=3"
-
-      # 1P2D TP8
-      - spec-decoding: "mtp"
-        conc-list: [ 32, 64 ]
-        prefill:
-          num-worker: 1
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "PREFILL_NODES=1"
-        decode:
-          num-worker: 2
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "DECODE_NODES=2"
-          - "DECODE_MTP_SIZE=3"
-
-      # 1*DEP8 + 1*DEP8
-      - spec-decoding: "mtp"
-        conc-list: [ 640, 512 ]
-        prefill:
-          num-worker: 1
-          tp: 8
-          ep: 8
-          dp-attn: true
-          additional-settings:
-          - "PREFILL_NODES=1"
-        decode:
-          num-worker: 1
-          tp: 8
-          ep: 8
-          dp-attn: true
-          additional-settings:
-          - "DECODE_NODES=1"
-          - "DECODE_MTP_SIZE=3"
-
-      # 1*DEP8 + 1*DEP8
-      - spec-decoding: "mtp"
-        conc-list: [ 256 ]
-        prefill:
-          num-worker: 1
-          tp: 8
-          ep: 8
-          dp-attn: true
-          additional-settings:
-          - "PREFILL_NODES=1"
-        decode:
-          num-worker: 1
-          tp: 8
-          ep: 8
-          dp-attn: true
-          additional-settings:
-          - "DECODE_NODES=1"
-          - "DECODE_MTP_SIZE=3"
-
-
-      # 1*DEP8 + 1*DEP8
-      - spec-decoding: "mtp"
-        conc-list: [ 128 ]
-        prefill:
-          num-worker: 1
-          tp: 8
-          ep: 8
-          dp-attn: true
-          additional-settings:
-          - "PREFILL_NODES=1"
-        decode:
-          num-worker: 1
-          tp: 8
-          ep: 8
-          dp-attn: true
-          additional-settings:
-          - "DECODE_NODES=1"
-          - "DECODE_MTP_SIZE=3"
-
       # 1*DEP8 + 1*DEP8
       - spec-decoding: "mtp"
         conc-list: [ 64 ]
@@ -2115,25 +1999,6 @@ dsr1-fp4-mi355x-sglang-disagg-8k1k-mtp:
           - "DECODE_NODES=1"
           - "DECODE_MTP_SIZE=3"
 
-      # 2*DEP8 + 1*DEP8
-      - spec-decoding: "mtp"
-        conc-list: [ 1024, 2048, 4096 ]
-        prefill:
-          num-worker: 2
-          tp: 8
-          ep: 8
-          dp-attn: true
-          additional-settings:
-          - "PREFILL_NODES=2"
-        decode:
-          num-worker: 1
-          tp: 8
-          ep: 8
-          dp-attn: true
-          additional-settings:
-          - "DECODE_NODES=1"
-          - "DECODE_MTP_SIZE=1"
-
 
 # DSv4-Pro FP4 on MI355X via SGLang. Uses a rocm720 mi35x image built off the
 # amd/deepseek_v4 branch in sgl-project/sglang; the SHA is encoded in the

From e316738b0fcb7f449481937b59b00223ffd4712e Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Tue, 9 Jun 2026 10:32:55 -0700
Subject: [PATCH 2/4] perf-changelog: record dsr1 8k1k mtp nightly 20260609 +
 conc-64 validation entry

---
 perf-changelog.yaml | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 5622173f1..2c9619f5c 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3531,3 +3531,11 @@
     - "The Rust frontend replaces only the Python serving/API layer (HTTP, tokenization, scheduling glue, detokenization) and spawns the same Python EngineCore, so GPU kernels/attention/MoE GEMM/KV cache are untouched"
     - "A/B sweep (28 single-node points, 1k1k + 8k1k, TP 1/2/4) vs the Python-frontend baseline (run 26696260751): throughput Pareto-neutral (peak tok/s/GPU within <1.5%, frontiers coincident) and TPOT flat (+-0.5%); TTFT improves ~8% at 1k1k and ~22% at 8k1k (every point), the expected signature of lower frontend CPU latency before first token, scaling with input length"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1634
+
+- config-keys:
+    - dsr1-fp4-mi355x-sglang-disagg-8k1k-mtp
+  description:
+    - "Bump SGLang ROCm image to nightly v0.5.12.post1-rocm720-mi35x-20260609, reported to carry the upstream MoRI EP dispatch-buffer fix (sgl-project/sglang#27194, ROCm/mori#356)"
+    - "Narrow the 8k1k MTP search space to the single conc-64 DEP8 + 1xDEP8 (MTP3) point. At conc-64/TP8/MTP3 the per-rank dispatch buffer num_max_dispatch_tokens_per_rank = max(CONC_LIST)/TP*(MTP+1) = 64/8*4 = 32 (<256), the regime that silently corrupted decode output on -20260529 (decodes fine, acceptance length stays high, but gsm8k=0)"
+    - "Harness/env settings left at main values (no env clamp, no in-place patcher) so a green conc-64 gsm8k validates the nightly itself fixes the kernel; pre-fix MI355X reference: dispatch=32->0.00, 64->0.00, >=256->0.94. Search space narrowed for this validation only -- restore the full sweep once confirmed"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1696

From ca5ab99226ccd5e65046b6f87dacfe996f8cad82 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Tue, 9 Jun 2026 10:45:05 -0700
Subject: [PATCH 3/4] dsr1 8k1k mtp: per-conc DEP8 eval ladder (64/32/16/8) for
 dispatch-bug validation

Expand the conc-64 point into separate DEP8+MTP3 entries for conc 64, 32,
16, 8 so each launches its own server and exercises a distinct sub-256
dispatch size (32/16/8/4). conc<=4 omitted (floor(conc/8)*4=0).

To get a gsm8k eval at every point (the harness otherwise evals only the
highest-conc entry per topology group, and ignores conc<16):
- mark_eval_entries: eval every eligible multinode entry per group, each
  at its own concurrency, instead of just max-conc.
- MIN_EVAL_CONC 16 -> 8 so conc-8 (dispatch=4) is eval-eligible.
Both are validation-only; revert with the full search-space restore.

Verified locally: generator emits 4 eval entries (conc 64/32/16/8, each
run-eval=true, eval-conc = own conc) and 4 benchmark entries.
---
 .github/configs/amd-master.yaml              | 60 +++++++++++++++++++-
 perf-changelog.yaml                          |  5 +-
 utils/matrix_logic/generate_sweep_configs.py | 20 +++++--
 3 files changed, 76 insertions(+), 9 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 75c52a6b3..e783a222f 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -1980,7 +1980,11 @@ dsr1-fp4-mi355x-sglang-disagg-8k1k-mtp:
     - isl: 8192
       osl: 1024
       search-space:
-      # 1*DEP8 + 1*DEP8
+      # Per-concurrency DEP8 + 1*DEP8 (MTP3) ladder to validate the nightly's
+      # MoRI dispatch-buffer fix across sub-256 dispatch sizes. Each conc is its
+      # own entry (own server launch) so dispatch = max(conc-list)/TP*(MTP+1)
+      # varies per point: conc 64->32, 32->16, 16->8, 8->4 (all <256, the bug
+      # regime). conc<=4 omitted: floor(conc/8)*4 = 0 under main's formula.
       - spec-decoding: "mtp"
         conc-list: [ 64 ]
         prefill:
@@ -1999,6 +2003,60 @@ dsr1-fp4-mi355x-sglang-disagg-8k1k-mtp:
           - "DECODE_NODES=1"
           - "DECODE_MTP_SIZE=3"
 
+      - spec-decoding: "mtp"
+        conc-list: [ 32 ]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "DECODE_NODES=1"
+          - "DECODE_MTP_SIZE=3"
+
+      - spec-decoding: "mtp"
+        conc-list: [ 16 ]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "DECODE_NODES=1"
+          - "DECODE_MTP_SIZE=3"
+
+      - spec-decoding: "mtp"
+        conc-list: [ 8 ]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "DECODE_NODES=1"
+          - "DECODE_MTP_SIZE=3"
+
 
 # DSv4-Pro FP4 on MI355X via SGLang. Uses a rocm720 mi35x image built off the
 # amd/deepseek_v4 branch in sgl-project/sglang; the SHA is encoded in the
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 2c9619f5c..b02d7be67 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3536,6 +3536,7 @@
     - dsr1-fp4-mi355x-sglang-disagg-8k1k-mtp
   description:
     - "Bump SGLang ROCm image to nightly v0.5.12.post1-rocm720-mi35x-20260609, reported to carry the upstream MoRI EP dispatch-buffer fix (sgl-project/sglang#27194, ROCm/mori#356)"
-    - "Narrow the 8k1k MTP search space to the single conc-64 DEP8 + 1xDEP8 (MTP3) point. At conc-64/TP8/MTP3 the per-rank dispatch buffer num_max_dispatch_tokens_per_rank = max(CONC_LIST)/TP*(MTP+1) = 64/8*4 = 32 (<256), the regime that silently corrupted decode output on -20260529 (decodes fine, acceptance length stays high, but gsm8k=0)"
-    - "Harness/env settings left at main values (no env clamp, no in-place patcher) so a green conc-64 gsm8k validates the nightly itself fixes the kernel; pre-fix MI355X reference: dispatch=32->0.00, 64->0.00, >=256->0.94. Search space narrowed for this validation only -- restore the full sweep once confirmed"
+    - "Narrow the 8k1k MTP search space to a per-concurrency DEP8 + 1xDEP8 (MTP3) ladder: conc 64, 32, 16, 8 as separate entries. Each is its own server launch so the per-rank dispatch buffer num_max_dispatch_tokens_per_rank = max(CONC_LIST)/TP*(MTP+1) varies per point: conc 64->32, 32->16, 16->8, 8->4 (all <256, the regime that silently corrupted decode output on -20260529: decodes fine, acceptance length stays high, but gsm8k=0). conc<=4 omitted because floor(conc/8)*4 = 0 under main's formula (degenerate)"
+    - "Harness/env settings left at main values (no env clamp, no in-place patcher) so a green gsm8k at each point validates the nightly itself fixes the kernel at that dispatch size; pre-fix MI355X reference: dispatch=32->0.00, 64->0.00, >=256->0.94"
+    - "VALIDATION-ONLY harness changes (revert with the full search space): utils/matrix_logic/generate_sweep_configs.py mark_eval_entries now evals every eligible multinode entry per group (not just the highest-conc one) so all four points get a gsm8k pass, and MIN_EVAL_CONC lowered 16->8 so conc-8 (dispatch=4) is eval-eligible"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1696
diff --git a/utils/matrix_logic/generate_sweep_configs.py b/utils/matrix_logic/generate_sweep_configs.py
index 53efcca9f..a3d7fe2fd 100644
--- a/utils/matrix_logic/generate_sweep_configs.py
+++ b/utils/matrix_logic/generate_sweep_configs.py
@@ -20,7 +20,10 @@
     "8k1k": (8192, 1024)
 }
 
-MIN_EVAL_CONC = 16
+# Lowered 16 -> 8 for the DEP8 MoRI dispatch-buffer validation sweep so the
+# conc-8 point (dispatch=4) is eval-eligible. Restore to 16 when the full
+# search space is restored.
+MIN_EVAL_CONC = 8
 
 # Reverse mapping for exp-name generation
 seq_len_itos = {v: k for k, v in seq_len_stoi.items()}
@@ -116,11 +119,16 @@ def _max_eval_conc(ie):
         mn_groups[key].append((i, entry))
 
     for entries in mn_groups.values():
-        best_idx, best_entry = max(entries, key=_max_eval_conc)
-        eval_indices.add(best_idx)
-        # Set eval-conc to median of eligible conc values to avoid OOM during eval
-        eval_concs = _eligible_eval_concs(best_entry)
-        mn_eval_conc[best_idx] = eval_concs[len(eval_concs) // 2]
+        # VALIDATION SWEEP: eval EVERY eligible entry in the group, not just the
+        # highest-conc one, each at the median of its own eligible conc values
+        # (its sole conc for the ladder: 8/16/32/64 -> dispatch 4/8/16/32).
+        # Revert to `best_idx, best_entry = max(entries, key=_max_eval_conc)`
+        # (single eval per group) when the full search space is restored.
+        for idx, entry in entries:
+            eval_indices.add(idx)
+            # Set eval-conc to median of eligible conc values to avoid OOM during eval
+            eval_concs = _eligible_eval_concs(entry)
+            mn_eval_conc[idx] = eval_concs[len(eval_concs) // 2]
 
     # Mark the selected entries (skip agentic entries which don't support evals)
     for i, entry in enumerate(matrix_values):

From 38370f43f510ddfba0048d708bb86584e94ea75a Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Tue, 9 Jun 2026 10:49:09 -0700
Subject: [PATCH 4/4] test: make multinode eval-conc floor test relative to
 MIN_EVAL_CONC

The test hardcoded conc [8,16,32]/[8] and expected eval-conc=32, which
broke when MIN_EVAL_CONC was lowered 16->8 (eligible median shifted to
16). Rebuild the conc lists from MIN_EVAL_CONC (below/at/above) so the
test asserts the floor behavior for any value of the constant -- passes
under both 8 and 16.
---
 utils/matrix_logic/test_generate_sweep_configs.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/utils/matrix_logic/test_generate_sweep_configs.py b/utils/matrix_logic/test_generate_sweep_configs.py
index 9bb473896..a4d133361 100644
--- a/utils/matrix_logic/test_generate_sweep_configs.py
+++ b/utils/matrix_logic/test_generate_sweep_configs.py
@@ -290,6 +290,11 @@ def test_multi_node_skips_groups_with_only_conc_below_min_conc(self):
 
     def test_multi_node_eval_conc_uses_only_conc_values_at_or_above_min_conc(self):
         """Multinode eval-conc should be chosen from conc values >= MIN_EVAL_CONC."""
+        # Build conc lists relative to the floor so this stays correct regardless
+        # of the MIN_EVAL_CONC value: `below` is excluded, eligible = {at, above}.
+        below = MIN_EVAL_CONC // 2
+        at = MIN_EVAL_CONC
+        above = MIN_EVAL_CONC * 2
         matrix_values = [
             {
                 "model": "deepseek-ai/DeepSeek-R1-0528",
@@ -311,7 +316,7 @@ def test_multi_node_eval_conc_uses_only_conc_values_at_or_above_min_conc(self):
                     "ep": 1,
                     "dp-attn": False,
                 },
-                "conc": [8, 16, 32],
+                "conc": [below, at, above],
             },
             {
                 "model": "deepseek-ai/DeepSeek-R1-0528",
@@ -333,14 +338,15 @@ def test_multi_node_eval_conc_uses_only_conc_values_at_or_above_min_conc(self):
                     "ep": 1,
                     "dp-attn": False,
                 },
-                "conc": [8],
+                "conc": [below],
             },
         ]
 
         result = mark_eval_entries(matrix_values)
 
         assert result[0]["run-eval"] is True
-        assert result[0]["eval-conc"] == 32
+        # eligible = sorted([at, above]); median (index len//2 == 1) == above
+        assert result[0]["eval-conc"] == above
         assert result[1]["run-eval"] is False
 
     def test_marks_highest_and_median_conc(self):