SemiAnalysisAI · functionstackx · Jun 11, 2026 · Jun 11, 2026 · Jun 11, 2026 · Jun 11, 2026
@@ -2136,6 +2136,66 @@ dsr1-fp4-mi355x-sglang-disagg-8k1k-mtp:
           - "DECODE_MTP_SIZE=1"
 
 
+# DeepSeek-V4-Pro FP4 PD-disaggregation on MI355X via SGLang + MoRI. Combines the
+# validated single-node DSv4 SGLang recipe (dsv4-fp4-mi355x-sglang below) with the
+# SGLang-disagg framework used by the dsr1 / qwen3.5 / glm5 mi355x recipes. Routes
+# to benchmarks/multi_node/dsv4_fp4_mi355x_sglang-disagg.sh; per-node serving flags
+# live in the DeepSeek-V4-Pro entry of amd_utils/models.yaml, the DSv4 FP4-experts
+# SGLANG_* env block in amd_utils/env.sh, and the config.json model_type patch in
+# amd_utils/setup_deps.sh (all gated on MODEL_NAME).
+#
+# Image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260610 — the mainline
+# ROCm nightly the validated 0610 single-node DSv4 recipe (PR #1701) runs on. It
+# carries DSv4 support (now on sglang main) AND is on the same -mi35x- image line
+# as the dsr1/qwen3.5/glm5 disagg recipes, so it has the MoRI disaggregation
+# transfer backend — unlike the rocm/sgl-dev:*-DSv4 branch image the aggregated
+# dsv4-fp4-mi355x-sglang entry uses (cut from amd/deepseek_v4, MoRI support
+# unverified). Mainline omits deep_gemm; env.sh hardcodes the #1701 fp8 wo_a /
+# topk-v2 fallbacks (matching that validated run) rather than a runtime detect.
+# The v0.5.12.post1 tag also auto-applies the MoRI conn.py overlay (job.slurm)
+# that fixes the wire format for hybrid/sparse-attention models.
+#
+# This smoke test is TP-only (EP1), so the DSv4 + MoRI-EP FP4 swiglu crash that
+# sglang#27855 fixes is not on the path; that monkey-patch is intentionally not
+# carried here and would be needed only when EP/DEP decode is enabled.
+#
+# Topology 1P1D, TP8/EP1, dp-attn false — the same conservative starting point the
+# qwen3.5 / glm5 sglang-disagg recipes launched with. Starts at a single ISL/OSL
+# (8k/1k) conc=1 as an end-to-end smoke test (does DSv4 + MoRI disagg come up and
+# transfer KV at all on this image) before expanding to the full conc / DEP sweep.
+dsv4-fp4-mi355x-sglang-disagg:
+  image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260610  # mainline image of the validated PR #1701 single-node run
+  model: deepseek-ai/DeepSeek-V4-Pro
+  model-prefix: dsv4
+  runner: mi355x-disagg
+  precision: fp4
+  framework: sglang-disagg
+  multinode: true
+  disagg: true
+  scenarios:
+    fixed-seq-len:
+    - isl: 8192  # 8k/1k point; isl + osl must fit the --context-length 9472 pinned in amd_utils/models.yaml
+      osl: 1024
+      search-space:
+      # 1P1D TP8/EP1: 1 prefill node (co-located with router) + 1 decode node
+      - spec-decoding: "none"
+        conc-list: [ 1 ]  # smoke test: a single concurrency point before the full sweep
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 1  # TP-only; EP/DEP would additionally need the sglang#27855 fix
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 1  # TP-only on the decode side as well
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=1"
+          - "DECODE_MTP_SIZE=0"  # presumably MTP off, matching spec-decoding "none" — confirm
+
 # DSv4-Pro FP4 on MI355X via SGLang. Uses a rocm720 mi35x image built off the
 # amd/deepseek_v4 branch in sgl-project/sglang; the SHA is encoded in the
 # image tag, so bumping sglang is just an image tag bump here. Sweeps

diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh
@@ -149,6 +149,41 @@ else
         export SAFETENSORS_FAST_GPU=1
     fi
 
+    # DeepSeek-V4-Pro (FP4 experts): the DSv4 SGLANG_* env block, copied verbatim
+    # from the validated 0610 single-node recipe (PR #1701, benchmarks/single_node/
+    # fixed_seq_len/dsv4_fp4_mi355x_sglang.sh). That PR realigned DSv4 to the
+    # mainline ...mi35x-20260610 image (now that DSv4 support is on sglang main):
+    # the dsv4 attention backend, unified_kv_triton FlashMLA, the aiter indexer
+    # (not tilelang), and the mainline fp8 wo_a / topk-v2 fallbacks hardcoded
+    # (SGLANG_OPT_FP8_WO_A_GEMM=false, SGLANG_OPT_USE_TOPK_V2=false) instead of a
+    # deep_gemm-presence detect. Branch-only FP4 MoE flags (SGLANG_DSV4_FP4_EXPERTS,
+    # SGLANG_FORCE_TRITON_MOE_FP8) are dropped — DSv4 main no longer needs them.
+    # Gated on MODEL_NAME so other models are unaffected.
+    if [[ "$MODEL_NAME" == "DeepSeek-V4-Pro" ]]; then  # exact-match gate: other models skip this whole block
+        export SGLANG_DEFAULT_THINKING=1
+        export SGLANG_DSV4_REASONING_EFFORT=max
+        export SGLANG_OPT_DEEPGEMM_HC_PRENORM=false
+        export SGLANG_USE_AITER=1
+        export SGLANG_USE_ROCM700A=0
+        export SGLANG_OPT_USE_FUSED_COMPRESS=true
+        export SGLANG_HACK_FLASHMLA_BACKEND=unified_kv_triton  # unified_kv_triton FlashMLA, as in PR #1701
+        export SGLANG_OPT_FP8_WO_A_GEMM=false  # fp8 wo_a fallback, hardcoded instead of a deep_gemm-presence detect
+        export SGLANG_OPT_USE_JIT_INDEXER_METADATA=false
+        export SGLANG_OPT_USE_TOPK_V2=false  # topk-v2 fallback, hardcoded for the same reason
+        export SGLANG_OPT_USE_AITER_INDEXER=true  # aiter indexer ...
+        export SGLANG_OPT_USE_TILELANG_INDEXER=false  # ... rather than the tilelang one
+        export SGLANG_OPT_USE_TILELANG_MHC_PRE=false
+        export SGLANG_OPT_USE_TILELANG_MHC_POST=false
+        export SGLANG_FP8_PAGED_MQA_LOGITS_TORCH=1
+        export SGLANG_OPT_USE_FUSED_COMPRESS_TRITON=true
+        export AITER_BF16_FP8_MOE_BOUND=0
+        export SGLANG_EAGER_INPUT_NO_COPY=true
+
+        # multi-stream: both knobs are switched off
+        export SGLANG_OPT_USE_MULTI_STREAM_OVERLAP=false
+        export SGLANG_ROCM_USE_MULTI_STREAM=false
+    fi
+
     # Disable allocating memory in one pass
     export MORI_SHMEM_MODE=ISOLATION
 

diff --git a/benchmarks/multi_node/amd_utils/models.yaml b/benchmarks/multi_node/amd_utils/models.yaml
@@ -349,3 +349,46 @@ DeepSeek-R1-0528-MXFP4-v2:
       max_running_requests: 128
       chunked_prefill_size: 262144
       cuda_graph_bs_range: "1-128"
+
+# DeepSeek-V4-Pro (FP4 experts) PD-disaggregation. Serving flags mirror the
+# validated 0610 single-node SGLang recipe (PR #1701, dsv4_fp4_mi355x_sglang.sh):
+# the dsv4 attention backend, SWA, page-size 256, the deepseekv4 tool-call /
+# deepseek-v4 reasoning parsers, the DSv4 thinking chat template, and
+# shared-experts-fusion disabled. The matching DSv4 SGLANG_* env block (#1701)
+# lives in env.sh, and the config.json model_type patch (deepseek_v4 ->
+# deepseek_v3) in setup_deps.sh, both gated on MODEL_NAME == DeepSeek-V4-Pro.
+# --context-length is pinned (model default is very long; would over-reserve KV);
+# 9472 covers the 8k/1k smoke point. kv-cache-dtype is left at the model default
+# (the #1701 single-node recipe sets none), unlike the fp8_e4m3 DeepSeek-R1
+# disagg entries. The prefill delayer (--enable-prefill-delayer) is intentionally
+# not used here.
+DeepSeek-V4-Pro:
+  base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --load-balance-method round_robin --disaggregation-transfer-backend mori --attention-backend dsv4 --swa-full-tokens-ratio 0.15 --page-size 256 --disable-shared-experts-fusion --tool-call-parser deepseekv4 --reasoning-parser deepseek-v4 --context-length 9472 --chat-template /workspace/benchmarks/single_node/chat_templates/deepseek_v4_thinking.jinja"
+  mtp_flags: ""  # empty: no MTP-specific flags are defined for this model
+  dp_flags: "--moe-a2a-backend mori --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"  # NOTE(review): presumably appended only when dp-attn is on — confirm in the launcher
+  prefill:
+    mem_fraction_static: 0.8
+    disable_radix_cache: true
+    dp:
+      max_running_requests: 24
+      chunked_prefill_size: "MORI_MAX_DISPATCH_TOKENS_PREFILL * PREFILL_TP_SIZE"  # string expression; presumably evaluated by the launcher — confirm
+      cuda_graph_bs: "1 2 3"  # explicit batch-size list; every other profile here uses cuda_graph_bs_range
+    no_dp:  # NOTE(review): presumably the profile the dp-attn false smoke test selects — confirm
+      max_running_requests: 32
+      chunked_prefill_size: 8192
+      cuda_graph_bs_range: "1-32"
+  decode:
+    mem_fraction_static: 0.85
+    prefill_round_robin_balance: true
+    dp:
+      max_running_requests: 4096
+      chunked_prefill_size: "MORI_MAX_DISPATCH_TOKENS_DECODE * DECODE_TP_SIZE"  # string expression; presumably evaluated by the launcher — confirm
+      cuda_graph_bs_range: "1-160"
+    ep_only:  # decode-only profile; prefill defines no ep_only entry
+      max_running_requests: 256
+      chunked_prefill_size: 262144
+      cuda_graph_bs_range: "1-256"
+    no_dp:  # NOTE(review): presumably the profile the dp-attn false smoke test selects — confirm
+      max_running_requests: 64
+      chunked_prefill_size: 8192
+      cuda_graph_bs_range: "1-64"
diff --git a/benchmarks/multi_node/amd_utils/setup_deps.sh b/benchmarks/multi_node/amd_utils/setup_deps.sh
@@ -735,6 +735,54 @@ install_transformers_glm5() {
     _SETUP_INSTALLED+=("transformers-glm5")
 }
 
+# ---------------------------------------------------------------------------
+# SGLang: DeepSeek-V4-Pro config.json model_type patch.
+#
+# Transformers in these images doesn't recognize the `deepseek_v4` model_type,
+# so AutoConfig.from_pretrained crashes before SGLang can dispatch. The
+# single-node DSv4 recipes patch the HF-cache config.json directly; for disagg
+# the weights live on shared NFS at $MODEL_DIR/$MODEL_NAME, so patch that
+# config.json instead. Set model_type -> deepseek_v3 (so AutoConfig succeeds)
+# while keeping architectures=['DeepseekV4ForCausalLM'] so SGLang still
+# dispatches to its native DSv4 model class.
+#
+# Idempotent (no-op unless model_type is deepseek_v4) and crash-safe under the
+# concurrent multi-node start: writes a same-dir temp file, copies config.json's
+# mode onto it (mkstemp creates 0600), then os.replace()s it (atomic rename). Only
+# runs for MODEL_NAME == DeepSeek-V4-Pro; a failed patch returns python's status.
+# ---------------------------------------------------------------------------
+patch_dsv4_config() {
+    if [[ "$MODEL_NAME" != "DeepSeek-V4-Pro" ]]; then
+        return 0
+    fi
+    local cfg="${MODEL_DIR}/${MODEL_NAME}/config.json"
+    if [[ ! -f "$cfg" ]]; then
+        echo "[SETUP] WARN: DSv4 config.json not found at $cfg; skipping model_type patch"
+        return 0
+    fi
+    python3 - "$cfg" <<'PYEOF' || return "$?"
+import json, os, shutil, sys, tempfile
+cfg = sys.argv[1]
+with open(cfg) as f:
+    config = json.load(f)
+if config.get("model_type") != "deepseek_v4":
+    print(f"[SETUP] DSv4 config.json already patched (model_type={config.get('model_type')!r})")
+    sys.exit(0)
+config["model_type"] = "deepseek_v3"
+fd, tmp = tempfile.mkstemp(dir=os.path.dirname(cfg), prefix=".config.json.", suffix=".tmp")
+try:
+    with os.fdopen(fd, "w") as f:
+        json.dump(config, f, indent=2)
+    shutil.copymode(cfg, tmp)  # mkstemp files are 0600; keep config.json readable by its other users
+    os.replace(tmp, cfg)
+    print(f"[SETUP] Patched {cfg}: model_type deepseek_v4 -> deepseek_v3")
+except Exception:
+    os.path.exists(tmp) and os.remove(tmp)
+    raise
+PYEOF
+    _SETUP_INSTALLED+=("dsv4-config-model-type")
+}
+
 # =============================================================================
 # Run installers (engine-gated)
 # =============================================================================
@@ -759,6 +807,7 @@ if [[ "$ENGINE" == "vllm-disagg" ]]; then
 else
     patch_gluon_pa_mqa_logits_instr_shape
     install_transformers_glm5
+    patch_dsv4_config
 fi
 
 _SETUP_END=$(date +%s)

diff --git a/benchmarks/multi_node/dsv4_fp4_mi355x_sglang-disagg.sh b/benchmarks/multi_node/dsv4_fp4_mi355x_sglang-disagg.sh
@@ -0,0 +1,91 @@
+#!/usr/bin/env bash
+
+# DeepSeek-V4-Pro FP4 disaggregated prefill/decode on MI355X via SGLang + MoRI.
+# Thin, model-agnostic launcher (same shape as the dsr1 / qwen3.5 / glm5
+# sglang-disagg wrappers): all serving flags live in the DeepSeek-V4-Pro entry
+# of amd_utils/models.yaml, DSv4-specific env + the config.json model_type patch
+# live in amd_utils/env.sh + setup_deps.sh, and topology (P/D node counts, TP/EP)
+# comes from amd-master.yaml.
+
+source "$(dirname "$0")/../benchmark_lib.sh"  # presumably defines check_env_vars, which is not defined in this script
+
+check_env_vars \
+    CONC_LIST \
+    ISL \
+    OSL \
+    IMAGE \
+    SPEC_DECODING \
+    MODEL_PATH \
+    PREFILL_NUM_WORKERS \
+    PREFILL_TP \
+    PREFILL_EP \
+    PREFILL_DP_ATTN \
+    DECODE_NUM_WORKERS \
+    DECODE_TP \
+    DECODE_EP \
+    DECODE_DP_ATTN \
+    PREFILL_NODES \
+    DECODE_NODES \
+    RANDOM_RANGE_RATIO \
+    FRAMEWORK  # NOTE(review): MODEL_NAME and GITHUB_WORKSPACE are used below but are not in this list
+
+if [[ -n "$SLURM_JOB_ID" ]]; then  # only log when already running inside a Slurm job
+  echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
+fi
+
+set -x  # trace every command from here on
+
+# Use upstreamed multi_node scripts (no external clone needed); exit 1 if the cd fails
+cd "$GITHUB_WORKSPACE/benchmarks/multi_node/amd_utils" || exit 1
+
+# Set up SGL launch script-specific environment variables
+export TIME_LIMIT="08:00:00"
+export MODEL_PATH=$MODEL_PATH
+export MODEL_NAME=$MODEL_NAME  # env.sh and setup_deps.sh gate their DSv4 handling on this being DeepSeek-V4-Pro
+export CONTAINER_IMAGE=$IMAGE
+
+if [[ "${PREFILL_EP:-1}" -eq 1 ]]; then  # EP size 1 (or unset) turns prefill expert parallelism off
+export PREFILL_ENABLE_EP=false
+else
+export PREFILL_ENABLE_EP=true
+fi
+
+if [[ "$PREFILL_DP_ATTN" == "true" ]]; then  # only the exact string "true" enables it
+export PREFILL_ENABLE_DP=true
+else
+export PREFILL_ENABLE_DP=false
+fi
+
+if [[ "${DECODE_EP:-1}" -eq 1 ]]; then  # EP size 1 (or unset) turns decode expert parallelism off
+export DECODE_ENABLE_EP=false
+else
+export DECODE_ENABLE_EP=true
+fi
+
+if [[ "$DECODE_DP_ATTN" == "true" ]]; then  # only the exact string "true" enables it
+export DECODE_ENABLE_DP=true
+else
+export DECODE_ENABLE_DP=false
+fi
+
+# Launch the job through submit.sh and capture its stdout as JOB_ID. Spaces in
+# CONC_LIST are replaced with 'x' because the underlying launch script expects the
+# concurrencies as a single 'x'-delimited token. NODE_LIST is optional and left
+# unquoted, so an unset or empty value adds no argument.
+JOB_ID=$(bash ./submit.sh $PREFILL_NODES \
+    $PREFILL_NUM_WORKERS \
+    $DECODE_NODES \
+    $DECODE_NUM_WORKERS \
+    $ISL $OSL "${CONC_LIST// /x}" inf \
+    ${PREFILL_ENABLE_EP} ${PREFILL_ENABLE_DP} \
+    ${DECODE_ENABLE_EP} ${DECODE_ENABLE_DP} \
+    ${PREFILL_TP} ${DECODE_TP} \
+    ${RANDOM_RANGE_RATIO} \
+    ${NODE_LIST:-})
+
+if [[ $? -ne 0 ]]; then  # exit status of the submit.sh command substitution above
+    echo "Failed to submit job" >&2
+    exit 1
+fi
+
+echo "$JOB_ID"  # print whatever submit.sh wrote to stdout
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
@@ -3547,3 +3547,13 @@
     - "MI355x DSR1-FP4: Include TP4 configurations for 8k1k"
     - "Expand the TP sweep (included TP=4) for 8k/1k configuration for conc=4 to 64"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1692
+
+- config-keys:
+    - dsv4-fp4-mi355x-sglang-disagg
+  description:
+    - "New recipe: DeepSeek-V4-Pro FP4 prefill/decode-disaggregated on MI355X via SGLang + MoRI. Combines the validated single-node DSv4 SGLang recipe with the sglang-disagg framework from the dsr1/qwen3.5/glm5 mi355x recipes"
+    - "New benchmarks/multi_node/dsv4_fp4_mi355x_sglang-disagg.sh launcher + DeepSeek-V4-Pro entry in amd_utils/models.yaml. Serving flags + env mirror the validated 0610 single-node recipe (PR #1701): dsv4 attention backend, SWA, page-size 256, deepseekv4/deepseek-v4 parsers, DSv4 thinking chat template, shared-experts-fusion off, context-length pinned, kv-cache-dtype at model default; prefill delayer not used"
+    - "DSv4 SGLANG_* env block from PR #1701 added to amd_utils/env.sh (unified_kv_triton FlashMLA, aiter indexer, mainline fp8 wo_a / topk-v2 fallbacks hardcoded, multi-stream off; branch-only SGLANG_DSV4_FP4_EXPERTS/FORCE_TRITON_MOE_FP8 dropped); idempotent atomic config.json model_type patch (deepseek_v4->deepseek_v3) added to amd_utils/setup_deps.sh, both gated on MODEL_NAME"
+    - "Image lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260610 (the validated PR #1701 0610 image; mainline, carries DSv4 support + MoRI disagg backend, auto-applies the MoRI conn.py overlay; MoRI support on the rocm/sgl-dev:*-DSv4 branch image is unverified)"
+    - "1P1D TP8/EP1 dp-attn false. Starts at a single ISL/OSL (8k/1k) conc=1 as an end-to-end smoke test before expanding the conc / DEP sweep"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1708