SemiAnalysisAI · Oseltamivir · Jun 9, 2026 · Jun 9, 2026 · Jun 9, 2026 · Jun 9, 2026
@@ -1967,7 +1967,7 @@ dsr1-fp4-mi355x-sglang-disagg-1k1k-mtp:
 
 
 dsr1-fp4-mi355x-sglang-disagg-8k1k-mtp:
-  image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260529
+  image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260609
   model: amd/DeepSeek-R1-0528-MXFP4-v2
   model-prefix: dsr1
   runner: mi355x-disagg
@@ -1980,67 +1980,13 @@ dsr1-fp4-mi355x-sglang-disagg-8k1k-mtp:
     - isl: 8192
       osl: 1024
       search-space:
-      # MTP configurations
-      # 1P1D pure TP8
-      - spec-decoding: "mtp"
-        conc-list: [ 1, 2, 4, 8 ]
-        prefill:
-          num-worker: 1
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "PREFILL_NODES=1"
-        decode:
-          num-worker: 1
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "DECODE_NODES=1"
-          - "DECODE_MTP_SIZE=3"
-
-      # 1P2D TP8
-      - spec-decoding: "mtp"
-        conc-list: [ 2, 4, 8, 16, 32 ]
-        prefill:
-          num-worker: 1
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "PREFILL_NODES=1"
-        decode:
-          num-worker: 2
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "DECODE_NODES=2"
-          - "DECODE_MTP_SIZE=3"
-
-      # 1P2D TP8
-      - spec-decoding: "mtp"
-        conc-list: [ 32, 64 ]
-        prefill:
-          num-worker: 1
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "PREFILL_NODES=1"
-        decode:
-          num-worker: 2
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "DECODE_NODES=2"
-          - "DECODE_MTP_SIZE=3"
-
-      # 1*DEP8 + 1*DEP8
+      # Per-concurrency 1*DEP8 + 1*DEP8 (MTP3) ladder to validate the nightly's
+      # MoRI dispatch-buffer fix across sub-256 dispatch sizes. Each conc is its
+      # own entry (own server launch) so dispatch = floor(max(conc-list)/TP) *
+      # (MTP+1) varies per point: conc 64->32, 32->16, 16->8, 8->4 (all <256,
+      # the bug regime). conc<=4 omitted: floor(conc/8)*4 = 0 under main's formula.
       - spec-decoding: "mtp"
-        conc-list: [ 640, 512 ]
+        conc-list: [ 64 ]
         prefill:
           num-worker: 1
           tp: 8
@@ -2057,9 +2003,8 @@ dsr1-fp4-mi355x-sglang-disagg-8k1k-mtp:
           - "DECODE_NODES=1"
           - "DECODE_MTP_SIZE=3"
 
-      # 1*DEP8 + 1*DEP8
       - spec-decoding: "mtp"
-        conc-list: [ 256 ]
+        conc-list: [ 32 ]
         prefill:
           num-worker: 1
           tp: 8
@@ -2076,10 +2021,8 @@ dsr1-fp4-mi355x-sglang-disagg-8k1k-mtp:
           - "DECODE_NODES=1"
           - "DECODE_MTP_SIZE=3"
 
-
-      # 1*DEP8 + 1*DEP8
       - spec-decoding: "mtp"
-        conc-list: [ 128 ]
+        conc-list: [ 16 ]
         prefill:
           num-worker: 1
           tp: 8
@@ -2096,9 +2039,8 @@ dsr1-fp4-mi355x-sglang-disagg-8k1k-mtp:
           - "DECODE_NODES=1"
           - "DECODE_MTP_SIZE=3"
 
-      # 1*DEP8 + 1*DEP8
       - spec-decoding: "mtp"
-        conc-list: [ 64 ]
+        conc-list: [ 8 ]
         prefill:
           num-worker: 1
           tp: 8
@@ -2115,25 +2057,6 @@ dsr1-fp4-mi355x-sglang-disagg-8k1k-mtp:
           - "DECODE_NODES=1"
           - "DECODE_MTP_SIZE=3"
 
-      # 2*DEP8 + 1*DEP8
-      - spec-decoding: "mtp"
-        conc-list: [ 1024, 2048, 4096 ]
-        prefill:
-          num-worker: 2
-          tp: 8
-          ep: 8
-          dp-attn: true
-          additional-settings:
-          - "PREFILL_NODES=2"
-        decode:
-          num-worker: 1
-          tp: 8
-          ep: 8
-          dp-attn: true
-          additional-settings:
-          - "DECODE_NODES=1"
-          - "DECODE_MTP_SIZE=1"
-
 
 # DSv4-Pro FP4 on MI355X via SGLang. Uses a rocm720 mi35x image built off the
 # amd/deepseek_v4 branch in sgl-project/sglang; the SHA is encoded in the

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
@@ -3531,3 +3531,12 @@
     - "The Rust frontend replaces only the Python serving/API layer (HTTP, tokenization, scheduling glue, detokenization) and spawns the same Python EngineCore, so GPU kernels/attention/MoE GEMM/KV cache are untouched"
     - "A/B sweep (28 single-node points, 1k1k + 8k1k, TP 1/2/4) vs the Python-frontend baseline (run 26696260751): throughput Pareto-neutral (peak tok/s/GPU within <1.5%, frontiers coincident) and TPOT flat (+-0.5%); TTFT improves ~8% at 1k1k and ~22% at 8k1k (every point), the expected signature of lower frontend CPU latency before first token, scaling with input length"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1634
+
+- config-keys:
+    - dsr1-fp4-mi355x-sglang-disagg-8k1k-mtp
+  description:
+    - "Bump SGLang ROCm image to nightly v0.5.12.post1-rocm720-mi35x-20260609, reported to carry the upstream MoRI EP dispatch-buffer fix (sgl-project/sglang#27194, ROCm/mori#356)"
+    - "Narrow the 8k1k MTP search space to a per-concurrency 1xDEP8 + 1xDEP8 (MTP3) ladder: conc 64, 32, 16, 8 as separate entries. Each is its own server launch so the per-rank dispatch buffer size num_max_dispatch_tokens_per_rank = max(CONC_LIST)/TP*(MTP+1) varies per point: conc 64->32, 32->16, 16->8, 8->4 (all <256, the regime that silently corrupted decode output on -20260529: decodes fine, acceptance length stays high, but gsm8k=0). conc<=4 omitted because floor(conc/8)*4 = 0 under main's formula (degenerate)"
+    - "Harness/env settings left at main values (no env clamp, no in-place patcher) so a green gsm8k at each point validates the nightly itself fixes the kernel at that dispatch size; pre-fix MI355X reference: dispatch=32->0.00, 64->0.00, >=256->0.94"
+    - "VALIDATION-ONLY harness changes (revert with the full search space): utils/matrix_logic/generate_sweep_configs.py mark_eval_entries now evals every eligible multinode entry per group (not just the highest-conc one) so all four points get a gsm8k pass, and MIN_EVAL_CONC lowered 16->8 so conc-8 (dispatch=4) is eval-eligible"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1696
diff --git a/utils/matrix_logic/generate_sweep_configs.py b/utils/matrix_logic/generate_sweep_configs.py
@@ -20,7 +20,10 @@
     "8k1k": (8192, 1024)
 }
 
-MIN_EVAL_CONC = 16
+# Lowered 16 -> 8 for the DEP8 MoRI dispatch-buffer validation sweep so the
+# conc-8 point (dispatch=4) is eval-eligible. Restore to 16 when the full
+# search space is restored.
+MIN_EVAL_CONC = 8
 
 # Reverse mapping for exp-name generation
 seq_len_itos = {v: k for k, v in seq_len_stoi.items()}
@@ -116,11 +119,16 @@ def _max_eval_conc(ie):
         mn_groups[key].append((i, entry))
 
     for entries in mn_groups.values():
-        best_idx, best_entry = max(entries, key=_max_eval_conc)
-        eval_indices.add(best_idx)
-        # Set eval-conc to median of eligible conc values to avoid OOM during eval
-        eval_concs = _eligible_eval_concs(best_entry)
-        mn_eval_conc[best_idx] = eval_concs[len(eval_concs) // 2]
+        # VALIDATION SWEEP: eval EVERY eligible entry in the group (each at its
+        # own concurrency), not just the highest-conc one, so the per-dispatch-
+        # size gsm8k ladder (conc 8/16/32/64 -> dispatch 4/8/16/32) is covered.
+        # Revert to `best_idx, best_entry = max(entries, key=_max_eval_conc)`
+        # (single eval per group) when the full search space is restored.
+        for idx, entry in entries:
+            eval_indices.add(idx)
+            # Set eval-conc to the median eligible conc (upper-middle if the count is even) to avoid OOM during eval
+            eval_concs = _eligible_eval_concs(entry)
+            mn_eval_conc[idx] = eval_concs[len(eval_concs) // 2]
 
     # Mark the selected entries (skip agentic entries which don't support evals)
     for i, entry in enumerate(matrix_values):

diff --git a/utils/matrix_logic/test_generate_sweep_configs.py b/utils/matrix_logic/test_generate_sweep_configs.py
@@ -290,6 +290,11 @@ def test_multi_node_skips_groups_with_only_conc_below_min_conc(self):
 
     def test_multi_node_eval_conc_uses_only_conc_values_at_or_above_min_conc(self):
         """Multinode eval-conc should be chosen from conc values >= MIN_EVAL_CONC."""
+        # Build conc lists relative to the floor so this stays correct regardless
+        # of the MIN_EVAL_CONC value: `below` is excluded, eligible = {at, above}.
+        below = MIN_EVAL_CONC // 2
+        at = MIN_EVAL_CONC
+        above = MIN_EVAL_CONC * 2
         matrix_values = [
             {
                 "model": "deepseek-ai/DeepSeek-R1-0528",
@@ -311,7 +316,7 @@ def test_multi_node_eval_conc_uses_only_conc_values_at_or_above_min_conc(self):
                     "ep": 1,
                     "dp-attn": False,
                 },
-                "conc": [8, 16, 32],
+                "conc": [below, at, above],
             },
             {
                 "model": "deepseek-ai/DeepSeek-R1-0528",
@@ -333,14 +338,15 @@ def test_multi_node_eval_conc_uses_only_conc_values_at_or_above_min_conc(self):
                     "ep": 1,
                     "dp-attn": False,
                 },
-                "conc": [8],
+                "conc": [below],
             },
         ]
 
         result = mark_eval_entries(matrix_values)
 
         assert result[0]["run-eval"] is True
-        assert result[0]["eval-conc"] == 32
+        # eligible = sorted([at, above]); median (index len//2 == 1) == above
+        assert result[0]["eval-conc"] == above
         assert result[1]["run-eval"] is False
 
     def test_marks_highest_and_median_conc(self):