SemiAnalysisAI · functionstackx · May 29, 2026 · Jun 2, 2026 · Jun 2, 2026 · Jun 2, 2026
@@ -1519,7 +1519,9 @@ dsr1-fp8-mi355x-sglang-disagg-mtp:
           - "DECODE_MTP_SIZE=2"
 
 kimik2.5-fp4-mi355x-vllm-disagg:
-  image: vllm/vllm-openai-rocm:nightly-bf610c2f56764e1b30bc6065f4ceace3d6e59036
+  # TODO(simondanielsson): change to pinned version once https://github.com/vllm-project/vllm/pull/40344
+  # is part of an official release (it is not in v0.22.0 / v0.22.1).
+  image: vllm/vllm-openai-rocm:nightly-3f0a91bb96f8d72e0498b95c166e817deae14d62
   model: amd/Kimi-K2.5-MXFP4
   model-prefix: kimik2.5
   runner: mi355x-disagg
@@ -1542,7 +1544,6 @@ kimik2.5-fp4-mi355x-vllm-disagg:
           dp-attn: false
           additional-settings:
           - "PREFILL_NODES=1"
-          - "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
         decode:
           num-worker: 2
           tp: 8
@@ -1563,7 +1564,6 @@ kimik2.5-fp4-mi355x-vllm-disagg:
           dp-attn: false
           additional-settings:
           - "PREFILL_NODES=1"
-          - "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
         decode:
           num-worker: 2
           tp: 8
@@ -1573,7 +1573,9 @@ kimik2.5-fp4-mi355x-vllm-disagg:
           - "DECODE_NODES=2"
 
 minimaxm2.5-fp8-mi355x-vllm-disagg:
-  image: vllm/vllm-openai-rocm:nightly-a6682d1d259cca69a9ae737ea5608fbbe7520031
+  # TODO(simondanielsson): change to pinned version once https://github.com/vllm-project/vllm/pull/40344
+  # is part of an official release (it is not in v0.22.0 / v0.22.1).
+  image: vllm/vllm-openai-rocm:nightly-3f0a91bb96f8d72e0498b95c166e817deae14d62
   model: MiniMaxAI/MiniMax-M2.5
   model-prefix: minimaxm2.5
   runner: mi355x-disagg
@@ -1598,7 +1600,6 @@ minimaxm2.5-fp8-mi355x-vllm-disagg:
           dp-attn: false
           additional-settings:
           - "PREFILL_NODES=1"
-          - "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
         decode:
           num-worker: 2
           tp: 8
@@ -1619,7 +1620,6 @@ minimaxm2.5-fp8-mi355x-vllm-disagg:
           dp-attn: false
           additional-settings:
           - "PREFILL_NODES=1"
-          - "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
         decode:
           num-worker: 2
           tp: 8
@@ -1628,6 +1628,68 @@ minimaxm2.5-fp8-mi355x-vllm-disagg:
           additional-settings:
           - "DECODE_NODES=2"
 
+# DeepSeek-V4-Pro disaggregated prefill/decode on MI355X via vLLM + MoRI-IO.
+# Combines the validated single-node DSv4 vLLM serving recipe
+# (dsv4-fp4-mi355x-vllm, vllm-project/recipes#433) with the vLLM-disagg framework
+# (the kimi / minimax vllm-disagg recipes above). Routes to
+# benchmarks/multi_node/dsv4_fp4_mi355x_vllm-disagg.sh; per-node serving flags +
+# env live in the DeepSeek-V4-Pro entry of amd_utils/models_vllm.yaml.
+#
+# DeepSeek-V4-Pro is FP4+FP8 mixed (FP4 MoE expert weights, FP8 for the rest);
+# InferenceX classifies this as fp4 — same as the dsv4-fp4-mi355x-{sglang,vllm,atom}
+# entries below.
+#
+# Image: the same vLLM ROCm nightly the kimi / minimax vllm-disagg recipes use
+# (#1585). This nightly carries the upstreamed MoRI-IO fixes (vllm#40344, merged
+# 2026-05-28), which let setup_deps.sh drop the large runtime MoRIIO patches and
+# use the native read_mode flag (set by server_vllm.sh in kv_connector_extra_config)
+# instead of the VLLM_MORIIO_CONNECTOR_READ_MODE env var. #40344 is NOT in the
+# v0.22.0 / v0.22.1 release tags (it landed ~1 day before the v0.22.0 cut and was
+# not backported), so the release tags can't be used with the patch-free path —
+# hence the nightly. It also postdates the DSv4 model class (vllm#40871,
+# 2026-05-05) and the MoRIIO connector (vllm#29304), so it carries both.
+#
+# Topology 1P1D: each prefill/decode worker is a full TP=8 node (EP=1), matching
+# the aggregated recipe which runs DSv4 on TP=8 without expert parallelism (so no
+# all2all backend is needed — the mori_low_latency a2a rename from #1585 applies
+# only to the EP-enabled kimi/minimax workers). DEP decode + a 1P2D topology are
+# follow-ups once the base disagg path validates.
+dsv4-fp4-mi355x-vllm-disagg:
+  image: vllm/vllm-openai-rocm:nightly-3f0a91bb96f8d72e0498b95c166e817deae14d62
+  model: deepseek-ai/DeepSeek-V4-Pro
+  model-prefix: dsv4
+  runner: mi355x-disagg
+  precision: fp4  # FP4+FP8 mixed checkpoint, classified as fp4
+  framework: vllm-disagg
+  multinode: true
+  disagg: true
+  scenarios:
+    fixed-seq-len:
+    # Smoke test: one ISL/OSL pair (8k/1k) at one concurrency (conc=1) to validate
+    # the DSv4 vLLM-disagg path (image, MoRIIO transport, serving flags, model
+    # staging) end-to-end before expanding to the full 1k1k + 8k1k, conc 8-512
+    # sweep that the kimi / minimax vllm-disagg recipes run.
+    - isl: 8192
+      osl: 1024
+      search-space:
+      # 1P1D: 1 prefill node (co-located with proxy) + 1 decode node = 2 nodes total
+      - spec-decoding: "none"
+        conc-list: [ 1 ]  # smoke test only; widen once the path validates
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 1  # EP=1: no expert parallelism
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 1  # EP=1: no expert parallelism
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=1"
+
 dsr1-fp4-mi355x-sglang-disagg:
   image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260519
   model: amd/DeepSeek-R1-0528-MXFP4-v2

diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm
@@ -316,7 +316,7 @@ SANITIZED_USER=$(echo "$USER_NAME" | tr -c 'a-zA-Z0-9_.-' '_')
 export DOCKER_CONT_NAME="container_${ENGINE}_${SANITIZED_USER}_${MODEL_NAME}_${SLURM_JOB_ID}"
 
 # vLLM external router container
-VLLM_ROUTER_IMAGE="${VLLM_ROUTER_IMAGE:-vllm/vllm-router:nightly-20260511-e667ebb}"
+VLLM_ROUTER_IMAGE="${VLLM_ROUTER_IMAGE:-vllm/vllm-router:nightly-20260603-e667ebb}"
 ROUTER_CONT_NAME="router_vllm_${SANITIZED_USER}_${SLURM_JOB_ID}"
 export RUN_FILE_FULL="$WS_PATH/${RUN_FILE}"
 
@@ -401,7 +401,6 @@ if [[ "$ENGINE" == "vllm-disagg" ]]; then
         -e UCX_LOG_LEVEL=warn
         -e HSA_ENABLE_SDMA=1
         -e PROXY_STREAM_IDLE_TIMEOUT=\${PROXY_STREAM_IDLE_TIMEOUT:-300}
-        -e VLLM_MORIIO_CONNECTOR_READ_MODE=\${VLLM_MORIIO_CONNECTOR_READ_MODE:-1}
         -e PYTHONPYCACHEPREFIX=/tmp/pycache
     )
 elif [[ "$ENGINE" == "atom-disagg" ]]; then

diff --git a/benchmarks/multi_node/amd_utils/models_vllm.yaml b/benchmarks/multi_node/amd_utils/models_vllm.yaml
@@ -26,19 +26,43 @@ amd-Llama-3.3-70B-Instruct-FP8-KV:
 
 Kimi-K2.5-MXFP4:
   prefill_flags: "--tensor-parallel-size 8 --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1 --gpu-memory-utilization 0.90 --mm-encoder-tp-mode data"
-  decode_flags: "--tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1 --gpu-memory-utilization 0.90 --mm-encoder-tp-mode data"
+  decode_flags: "--tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori_low_latency --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1 --gpu-memory-utilization 0.90 --mm-encoder-tp-mode data"
   env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_PAGED_ATTN=0 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_SILU_MUL=0 VLLM_ENGINE_READY_TIMEOUT_S=3600"
   hf_dir: "models--amd--Kimi-K2.5-MXFP4"
 
 MiniMax-M2.5:
   # AITER fused-MoE kernel fmoe_bf16_blockscaleFp8_g1u1_vs_silu_32x384 for gfx950 writes OOB when run with MiniMax's shapes at M=8K(=num batched tokens), crashing vllm during AITER warmup.
   # Set token budget to 4k to avoid using that shape, instead of disabling AITER_MOE.
-  prefill_flags: "--max-num-batched-tokens 4K --tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori --no-enable-prefix-caching --gpu-memory-utilization 0.95 --block-size 32"
-  decode_flags: "--max-num-batched-tokens 4K --tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori --no-enable-prefix-caching --gpu-memory-utilization 0.95 --block-size 32"
+  prefill_flags: "--max-num-batched-tokens 4K --tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori_low_latency --no-enable-prefix-caching --gpu-memory-utilization 0.95 --block-size 32"
+  decode_flags: "--max-num-batched-tokens 4K --tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori_low_latency --no-enable-prefix-caching --gpu-memory-utilization 0.95 --block-size 32"
   env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 VLLM_ENGINE_READY_TIMEOUT_S=3600 VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=1"
   hf_dir: "models--MiniMaxAI--MiniMax-M2.5"
 
 gpt-oss-120b:
   prefill_flags: "--tensor-parallel-size 8"
   decode_flags: "--tensor-parallel-size 8"
   env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 VLLM_USE_AITER_UNIFIED_ATTENTION=1 VLLM_ROCM_USE_AITER_MHA=0 ROCM_TRITON_MOE_PRESHUFFLE_SCALES=0"
+
+DeepSeek-V4-Pro:
+  # DeepSeek-V4-Pro is mixed-precision FP4+FP8 (FP4 MoE expert weights dominate
+  # the ~960 GB footprint; FP8 on attention/norm/router; FP8 KV cache at runtime).
+  # InferenceX classifies this as the fp4 variant.
+  #
+  # Serving flags reuse the validated single-node MI355X recipe
+  # (benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_vllm.sh, from
+  # vllm-project/recipes#433) so the per-node engine config is identical to the
+  # known-good aggregated run; disaggregation only adds the MoRIIO kv-transfer
+  # role (injected by server_vllm.sh). Each P/D worker is a full TP=8 node, EP=1
+  # — matching the aggregated recipe, which runs DSv4 on TP=8 without expert
+  # parallelism. DEP decode is a follow-up.
+  #
+  # --moe-backend triton_unfused is REQUIRED for the FP4 MoE expert weight format:
+  # the auto backend doesn't register the FP4 scale params, so the safetensors load
+  # raises KeyError. --enforce-eager (no CUDA graphs) keeps this first disagg recipe
+  # robust against cudagraph/MoRIIO-hook interactions; FULL/PIECEWISE capture is a
+  # follow-up. --async-scheduling is intentionally omitted (the kimi / minimax
+  # vllm-disagg recipes don't use it). Prefill and decode flags are identical.
+  prefill_flags: "--tensor-parallel-size 8 --distributed-executor-backend mp --kv-cache-dtype fp8 --moe-backend triton_unfused --tokenizer-mode deepseek_v4 --reasoning-parser deepseek_v4 --no-enable-prefix-caching --gpu-memory-utilization 0.9 --max-num-batched-tokens 8192 --enforce-eager"
+  decode_flags: "--tensor-parallel-size 8 --distributed-executor-backend mp --kv-cache-dtype fp8 --moe-backend triton_unfused --tokenizer-mode deepseek_v4 --reasoning-parser deepseek_v4 --no-enable-prefix-caching --gpu-memory-utilization 0.9 --max-num-batched-tokens 8192 --enforce-eager"
+  env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ENGINE_READY_TIMEOUT_S=3600"
+  hf_dir: "models--deepseek-ai--DeepSeek-V4-Pro"
diff --git a/benchmarks/multi_node/amd_utils/server_vllm.sh b/benchmarks/multi_node/amd_utils/server_vllm.sh
@@ -256,7 +256,7 @@ if [ "$NODE_RANK" -eq 0 ]; then
         --served-model-name ${SERVED_MODEL} \
         --port $SERVER_PORT \
         --trust-remote-code \
-        --kv-transfer-config '{\"kv_connector\": \"MoRIIOConnector\", \"kv_role\": \"kv_producer\", \"kv_connector_extra_config\": {\"proxy_ip\": \"${NODE0_ADDR}\", \"proxy_ping_port\": \"${PROXY_PING_PORT}\", \"http_port\": \"${SERVER_PORT}\"}}' \
+        --kv-transfer-config '{\"kv_connector\": \"MoRIIOConnector\", \"kv_role\": \"kv_producer\", \"kv_connector_extra_config\": {\"proxy_ip\": \"${NODE0_ADDR}\", \"proxy_ping_port\": \"${PROXY_PING_PORT}\", \"http_port\": \"${SERVER_PORT}\", \"read_mode\": true}}' \
         ${PREFILL_SERVER_CONFIG}"
 
     if [[ "$DRY_RUN" -eq 1 ]]; then
@@ -422,7 +422,7 @@ elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$xP" ]; then
         --served-model-name ${SERVED_MODEL} \
         --port $SERVER_PORT \
         --trust-remote-code \
-        --kv-transfer-config '{\"kv_connector\": \"MoRIIOConnector\", \"kv_role\": \"kv_producer\", \"kv_connector_extra_config\": {\"proxy_ip\": \"${NODE0_ADDR}\", \"proxy_ping_port\": \"${PROXY_PING_PORT}\", \"http_port\": \"${SERVER_PORT}\"}}' \
+        --kv-transfer-config '{\"kv_connector\": \"MoRIIOConnector\", \"kv_role\": \"kv_producer\", \"kv_connector_extra_config\": {\"proxy_ip\": \"${NODE0_ADDR}\", \"proxy_ping_port\": \"${PROXY_PING_PORT}\", \"http_port\": \"${SERVER_PORT}\", \"read_mode\": true}}' \
         ${PREFILL_SERVER_CONFIG}"
 
     if [[ "$DRY_RUN" -eq 1 ]]; then
@@ -478,7 +478,7 @@ else
         --served-model-name ${SERVED_MODEL} \
         --port $SERVER_PORT \
         --trust-remote-code \
-        --kv-transfer-config '{\"kv_connector\": \"MoRIIOConnector\", \"kv_role\": \"kv_consumer\", \"kv_connector_extra_config\": {\"proxy_ip\": \"${NODE0_ADDR}\", \"proxy_ping_port\": \"${PROXY_PING_PORT}\", \"http_port\": \"${SERVER_PORT}\"}}' \
+        --kv-transfer-config '{\"kv_connector\": \"MoRIIOConnector\", \"kv_role\": \"kv_consumer\", \"kv_connector_extra_config\": {\"proxy_ip\": \"${NODE0_ADDR}\", \"proxy_ping_port\": \"${PROXY_PING_PORT}\", \"http_port\": \"${SERVER_PORT}\", \"read_mode\": true}}' \
         ${DECODE_SERVER_CONFIG}"
 
     if [[ "$DRY_RUN" -eq 1 ]]; then