Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 68 additions & 6 deletions .github/configs/amd-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1519,7 +1519,9 @@ dsr1-fp8-mi355x-sglang-disagg-mtp:
- "DECODE_MTP_SIZE=2"

kimik2.5-fp4-mi355x-vllm-disagg:
image: vllm/vllm-openai-rocm:nightly-bf610c2f56764e1b30bc6065f4ceace3d6e59036
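# TODO(simondanielsson): change to a pinned version once https://github.com/vllm-project/vllm/pull/40344
# is part of an official release. Per the dsv4-fp4-mi355x-vllm-disagg notes in this file,
# it is NOT in the v0.22.0 / v0.22.1 tags, so this needs a later release.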
image: vllm/vllm-openai-rocm:nightly-3f0a91bb96f8d72e0498b95c166e817deae14d62
model: amd/Kimi-K2.5-MXFP4
model-prefix: kimik2.5
runner: mi355x-disagg
Expand All @@ -1542,7 +1544,6 @@ kimik2.5-fp4-mi355x-vllm-disagg:
dp-attn: false
additional-settings:
- "PREFILL_NODES=1"
- "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
decode:
num-worker: 2
tp: 8
Expand All @@ -1563,7 +1564,6 @@ kimik2.5-fp4-mi355x-vllm-disagg:
dp-attn: false
additional-settings:
- "PREFILL_NODES=1"
- "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
decode:
num-worker: 2
tp: 8
Expand All @@ -1573,7 +1573,9 @@ kimik2.5-fp4-mi355x-vllm-disagg:
- "DECODE_NODES=2"

minimaxm2.5-fp8-mi355x-vllm-disagg:
image: vllm/vllm-openai-rocm:nightly-a6682d1d259cca69a9ae737ea5608fbbe7520031
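# TODO(simondanielsson): change to a pinned version once https://github.com/vllm-project/vllm/pull/40344
# is part of an official release. Per the dsv4-fp4-mi355x-vllm-disagg notes in this file,
# it is NOT in the v0.22.0 / v0.22.1 tags, so this needs a later release.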
image: vllm/vllm-openai-rocm:nightly-3f0a91bb96f8d72e0498b95c166e817deae14d62
model: MiniMaxAI/MiniMax-M2.5
model-prefix: minimaxm2.5
runner: mi355x-disagg
Expand All @@ -1598,7 +1600,6 @@ minimaxm2.5-fp8-mi355x-vllm-disagg:
dp-attn: false
additional-settings:
- "PREFILL_NODES=1"
- "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
decode:
num-worker: 2
tp: 8
Expand All @@ -1619,7 +1620,6 @@ minimaxm2.5-fp8-mi355x-vllm-disagg:
dp-attn: false
additional-settings:
- "PREFILL_NODES=1"
- "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
decode:
num-worker: 2
tp: 8
Expand All @@ -1628,6 +1628,68 @@ minimaxm2.5-fp8-mi355x-vllm-disagg:
additional-settings:
- "DECODE_NODES=2"

# DeepSeek-V4-Pro disaggregated prefill/decode on MI355X via vLLM + MoRI-IO.
# Combines the validated single-node DSv4 vLLM serving recipe
# (dsv4-fp4-mi355x-vllm, vllm-project/recipes#433) with the vLLM-disagg framework
# (the kimi / minimax vllm-disagg recipes above). Routes to
# benchmarks/multi_node/dsv4_fp4_mi355x_vllm-disagg.sh; per-node serving flags +
# env live in the DeepSeek-V4-Pro entry of amd_utils/models_vllm.yaml.
#
# DeepSeek-V4-Pro is FP4+FP8 mixed (FP4 MoE expert weights, FP8 for the rest);
# InferenceX classifies this as fp4 — same as the dsv4-fp4-mi355x-{sglang,vllm,atom}
# entries below.
#
# Image: the same vLLM ROCm nightly the kimi / minimax vllm-disagg recipes use
# (#1585). This nightly carries the upstreamed MoRI-IO fixes (vllm#40344, merged
# 2026-05-28), which let setup_deps.sh drop the large runtime MoRIIO patches and
# use the native read_mode flag (set by server_vllm.sh in kv_connector_extra_config)
# instead of the VLLM_MORIIO_CONNECTOR_READ_MODE env var. #40344 is NOT in the
# v0.22.0 / v0.22.1 release tags (it landed ~1 day before the v0.22.0 cut and was
# not backported), so the release tags can't be used with the patch-free path —
# hence the nightly. It also postdates the DSv4 model class (vllm#40871,
# 2026-05-05) and the MoRIIO connector (vllm#29304), so it carries both.
#
# Topology 1P1D: each prefill/decode worker is a full TP=8 node (EP=1), matching
# the aggregated recipe which runs DSv4 on TP=8 without expert parallelism (so no
# all2all backend is needed — the mori_low_latency a2a rename from #1585 applies
# only to the EP decode of kimi/minimax). DEP decode + multi-node 1P2D are
# follow-ups once the base disagg path validates.
dsv4-fp4-mi355x-vllm-disagg:
  image: vllm/vllm-openai-rocm:nightly-3f0a91bb96f8d72e0498b95c166e817deae14d62
  model: deepseek-ai/DeepSeek-V4-Pro
  model-prefix: dsv4
  runner: mi355x-disagg
  precision: fp4
  framework: vllm-disagg
  multinode: true
  disagg: true
  scenarios:
    fixed-seq-len:
      # Smoke test: a single ISL/OSL (8k/1k) at a single concurrency (conc=1) to
      # validate the DSv4 vLLM-disagg path (image, MoRIIO transport, serving flags,
      # model staging) end-to-end before expanding to the full 1k1k + 8k1k,
      # conc 8-512 sweep that the kimi / minimax vllm-disagg recipes run.
      - isl: 8192
        osl: 1024
        search-space:
          # 1P1D: 1 prefill node (co-located with proxy) + 1 decode node = 2 nodes total
          - spec-decoding: "none"
            conc-list: [ 1 ]
            # Prefill side: one worker spanning a full TP=8 node, no expert parallelism.
            prefill:
              num-worker: 1
              tp: 8
              ep: 1
              dp-attn: false
              additional-settings:
                - "PREFILL_NODES=1"
            # Decode side: mirrors the prefill worker shape (TP=8, EP=1) on its own node.
            decode:
              num-worker: 1
              tp: 8
              ep: 1
              dp-attn: false
              additional-settings:
                - "DECODE_NODES=1"

dsr1-fp4-mi355x-sglang-disagg:
image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260519
model: amd/DeepSeek-R1-0528-MXFP4-v2
Expand Down
3 changes: 1 addition & 2 deletions benchmarks/multi_node/amd_utils/job.slurm
Original file line number Diff line number Diff line change
Expand Up @@ -316,7 +316,7 @@ SANITIZED_USER=$(echo "$USER_NAME" | tr -c 'a-zA-Z0-9_.-' '_')
export DOCKER_CONT_NAME="container_${ENGINE}_${SANITIZED_USER}_${MODEL_NAME}_${SLURM_JOB_ID}"

# vLLM external router container
VLLM_ROUTER_IMAGE="${VLLM_ROUTER_IMAGE:-vllm/vllm-router:nightly-20260511-e667ebb}"
VLLM_ROUTER_IMAGE="${VLLM_ROUTER_IMAGE:-vllm/vllm-router:nightly-20260603-e667ebb}"
ROUTER_CONT_NAME="router_vllm_${SANITIZED_USER}_${SLURM_JOB_ID}"
export RUN_FILE_FULL="$WS_PATH/${RUN_FILE}"

Expand Down Expand Up @@ -401,7 +401,6 @@ if [[ "$ENGINE" == "vllm-disagg" ]]; then
-e UCX_LOG_LEVEL=warn
-e HSA_ENABLE_SDMA=1
-e PROXY_STREAM_IDLE_TIMEOUT=\${PROXY_STREAM_IDLE_TIMEOUT:-300}
-e VLLM_MORIIO_CONNECTOR_READ_MODE=\${VLLM_MORIIO_CONNECTOR_READ_MODE:-1}
-e PYTHONPYCACHEPREFIX=/tmp/pycache
)
elif [[ "$ENGINE" == "atom-disagg" ]]; then
Expand Down
30 changes: 27 additions & 3 deletions benchmarks/multi_node/amd_utils/models_vllm.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,19 +26,43 @@ amd-Llama-3.3-70B-Instruct-FP8-KV:

Kimi-K2.5-MXFP4:
  # Prefill runs plain TP=8; decode additionally enables expert parallelism and
  # selects the mori_low_latency all2all backend. Exactly one decode_flags key is
  # kept: a duplicate key would be silently resolved last-wins by most parsers.
  prefill_flags: "--tensor-parallel-size 8 --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1 --gpu-memory-utilization 0.90 --mm-encoder-tp-mode data"
  decode_flags: "--tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori_low_latency --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1 --gpu-memory-utilization 0.90 --mm-encoder-tp-mode data"
  env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_PAGED_ATTN=0 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_SILU_MUL=0 VLLM_ENGINE_READY_TIMEOUT_S=3600"
  hf_dir: "models--amd--Kimi-K2.5-MXFP4"

MiniMax-M2.5:
  # AITER fused-MoE kernel fmoe_bf16_blockscaleFp8_g1u1_vs_silu_32x384 for gfx950 writes OOB when run with MiniMax's shapes at M=8K(=num batched tokens), crashing vllm during AITER warmup.
  # Set token budget to 4k to avoid using that shape, instead of disabling AITER_MOE.
  # Prefill and decode share the same flags; both use the mori_low_latency
  # all2all backend. Each key appears once (no duplicate prefill_flags / decode_flags).
  prefill_flags: "--max-num-batched-tokens 4K --tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori_low_latency --no-enable-prefix-caching --gpu-memory-utilization 0.95 --block-size 32"
  decode_flags: "--max-num-batched-tokens 4K --tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori_low_latency --no-enable-prefix-caching --gpu-memory-utilization 0.95 --block-size 32"
  env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 VLLM_ENGINE_READY_TIMEOUT_S=3600 VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=1"
  hf_dir: "models--MiniMaxAI--MiniMax-M2.5"

gpt-oss-120b:
  # Prefill and decode use the same single flag (TP=8); everything else is
  # driven by the env string below.
  prefill_flags: "--tensor-parallel-size 8"
  decode_flags: "--tensor-parallel-size 8"
  env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 VLLM_USE_AITER_UNIFIED_ATTENTION=1 VLLM_ROCM_USE_AITER_MHA=0 ROCM_TRITON_MOE_PRESHUFFLE_SCALES=0"

DeepSeek-V4-Pro:
  # DeepSeek-V4-Pro is mixed-precision FP4+FP8 (FP4 MoE expert weights dominate
  # the ~960 GB footprint; FP8 on attention/norm/router; FP8 KV cache at runtime).
  # InferenceX classifies this as the fp4 variant.
  #
  # Serving flags reuse the validated single-node MI355X recipe
  # (benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_vllm.sh, from
  # vllm-project/recipes#433) so the per-node engine config is identical to the
  # known-good aggregated run; disaggregation only adds the MoRIIO kv-transfer
  # role (injected by server_vllm.sh). Each P/D worker is a full TP=8 node, EP=1
  # — matching the aggregated recipe, which runs DSv4 on TP=8 without expert
  # parallelism. DEP decode is a follow-up.
  #
  # --moe-backend triton_unfused is REQUIRED for the FP4 MoE expert weight format;
  # the auto backend doesn't register the FP4 scale params and safetensors load
  # raises KeyError. --enforce-eager (no CUDA graphs) keeps the first disagg recipe
  # robust against cudagraph/MoRIIO-hook interactions; FULL/PIECEWISE capture is a
  # follow-up. --async-scheduling is intentionally omitted (not used by the kimi /
  # minimax vllm-disagg recipes).
  #
  # prefill_flags and decode_flags are intentionally identical; keep them in sync
  # until a role-specific setting (e.g. DEP decode) is introduced.
  prefill_flags: "--tensor-parallel-size 8 --distributed-executor-backend mp --kv-cache-dtype fp8 --moe-backend triton_unfused --tokenizer-mode deepseek_v4 --reasoning-parser deepseek_v4 --no-enable-prefix-caching --gpu-memory-utilization 0.9 --max-num-batched-tokens 8192 --enforce-eager"
  decode_flags: "--tensor-parallel-size 8 --distributed-executor-backend mp --kv-cache-dtype fp8 --moe-backend triton_unfused --tokenizer-mode deepseek_v4 --reasoning-parser deepseek_v4 --no-enable-prefix-caching --gpu-memory-utilization 0.9 --max-num-batched-tokens 8192 --enforce-eager"
  env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ENGINE_READY_TIMEOUT_S=3600"
  hf_dir: "models--deepseek-ai--DeepSeek-V4-Pro"
6 changes: 3 additions & 3 deletions benchmarks/multi_node/amd_utils/server_vllm.sh
Original file line number Diff line number Diff line change
Expand Up @@ -256,7 +256,7 @@ if [ "$NODE_RANK" -eq 0 ]; then
--served-model-name ${SERVED_MODEL} \
--port $SERVER_PORT \
--trust-remote-code \
--kv-transfer-config '{\"kv_connector\": \"MoRIIOConnector\", \"kv_role\": \"kv_producer\", \"kv_connector_extra_config\": {\"proxy_ip\": \"${NODE0_ADDR}\", \"proxy_ping_port\": \"${PROXY_PING_PORT}\", \"http_port\": \"${SERVER_PORT}\"}}' \
--kv-transfer-config '{\"kv_connector\": \"MoRIIOConnector\", \"kv_role\": \"kv_producer\", \"kv_connector_extra_config\": {\"proxy_ip\": \"${NODE0_ADDR}\", \"proxy_ping_port\": \"${PROXY_PING_PORT}\", \"http_port\": \"${SERVER_PORT}\", \"read_mode\": true}}' \
${PREFILL_SERVER_CONFIG}"

if [[ "$DRY_RUN" -eq 1 ]]; then
Expand Down Expand Up @@ -422,7 +422,7 @@ elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$xP" ]; then
--served-model-name ${SERVED_MODEL} \
--port $SERVER_PORT \
--trust-remote-code \
--kv-transfer-config '{\"kv_connector\": \"MoRIIOConnector\", \"kv_role\": \"kv_producer\", \"kv_connector_extra_config\": {\"proxy_ip\": \"${NODE0_ADDR}\", \"proxy_ping_port\": \"${PROXY_PING_PORT}\", \"http_port\": \"${SERVER_PORT}\"}}' \
--kv-transfer-config '{\"kv_connector\": \"MoRIIOConnector\", \"kv_role\": \"kv_producer\", \"kv_connector_extra_config\": {\"proxy_ip\": \"${NODE0_ADDR}\", \"proxy_ping_port\": \"${PROXY_PING_PORT}\", \"http_port\": \"${SERVER_PORT}\", \"read_mode\": true}}' \
${PREFILL_SERVER_CONFIG}"

if [[ "$DRY_RUN" -eq 1 ]]; then
Expand Down Expand Up @@ -478,7 +478,7 @@ else
--served-model-name ${SERVED_MODEL} \
--port $SERVER_PORT \
--trust-remote-code \
--kv-transfer-config '{\"kv_connector\": \"MoRIIOConnector\", \"kv_role\": \"kv_consumer\", \"kv_connector_extra_config\": {\"proxy_ip\": \"${NODE0_ADDR}\", \"proxy_ping_port\": \"${PROXY_PING_PORT}\", \"http_port\": \"${SERVER_PORT}\"}}' \
--kv-transfer-config '{\"kv_connector\": \"MoRIIOConnector\", \"kv_role\": \"kv_consumer\", \"kv_connector_extra_config\": {\"proxy_ip\": \"${NODE0_ADDR}\", \"proxy_ping_port\": \"${PROXY_PING_PORT}\", \"http_port\": \"${SERVER_PORT}\", \"read_mode\": true}}' \
${DECODE_SERVER_CONFIG}"

if [[ "$DRY_RUN" -eq 1 ]]; then
Expand Down
Loading
Loading