diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 0e548e611..f3e0c6b81 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -2136,6 +2136,66 @@ dsr1-fp4-mi355x-sglang-disagg-8k1k-mtp: - "DECODE_MTP_SIZE=1" +# DeepSeek-V4-Pro FP4 PD-disaggregation on MI355X via SGLang + MoRI. Combines the +# validated single-node DSv4 SGLang recipe (dsv4-fp4-mi355x-sglang below) with the +# SGLang-disagg framework used by the dsr1 / qwen3.5 / glm5 mi355x recipes. Routes +# to benchmarks/multi_node/dsv4_fp4_mi355x_sglang-disagg.sh; per-node serving flags +# live in the DeepSeek-V4-Pro entry of amd_utils/models.yaml, the DSv4 FP4-experts +# SGLANG_* env block in amd_utils/env.sh, and the config.json model_type patch in +# amd_utils/setup_deps.sh (all gated on MODEL_NAME). +# +# Image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260610 — the mainline +# ROCm nightly the validated 0610 single-node DSv4 recipe (PR #1701) runs on. It +# carries DSv4 support (now on sglang main) AND is on the same -mi35x- image line +# as the dsr1/qwen3.5/glm5 disagg recipes, so it has the MoRI disaggregation +# transfer backend — unlike the rocm/sgl-dev:*-DSv4 branch image the aggregated +# dsv4-fp4-mi355x-sglang entry uses (cut from amd/deepseek_v4, MoRI support +# unverified). Mainline omits deep_gemm; env.sh hardcodes the #1701 fp8 wo_a / +# topk-v2 fallbacks (matching that validated run) rather than a runtime detect. +# The v0.5.12.post1 tag also auto-applies the MoRI conn.py overlay (job.slurm) +# that fixes the wire format for hybrid/sparse-attention models. +# +# This smoke test is TP-only (EP1), so the DSv4 + MoRI-EP FP4 swiglu crash that +# sglang#27855 fixes is not on the path; that monkey-patch is intentionally not +# carried here and would be needed only when EP/DEP decode is enabled. 
+# +# Topology 1P1D, TP8/EP1, dp-attn false — the same conservative starting point the +# qwen3.5 / glm5 sglang-disagg recipes launched with. Starts at a single ISL/OSL +# (8k/1k) conc=1 as an end-to-end smoke test (does DSv4 + MoRI disagg come up and +# transfer KV at all on this image) before expanding to the full conc / DEP sweep. +dsv4-fp4-mi355x-sglang-disagg: + image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260610 + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: mi355x-disagg + precision: fp4 + framework: sglang-disagg + multinode: true + disagg: true + scenarios: + fixed-seq-len: + - isl: 8192 + osl: 1024 + search-space: + # 1P1D TP8/EP1: 1 prefill node (co-located with router) + 1 decode node + - spec-decoding: "none" + conc-list: [ 1 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + # DSv4-Pro FP4 on MI355X via SGLang. Uses a rocm720 mi35x image built off the # amd/deepseek_v4 branch in sgl-project/sglang; the SHA is encoded in the # image tag, so bumping sglang is just an image tag bump here. Sweeps diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh index 71d2653bd..dbe2fa4db 100755 --- a/benchmarks/multi_node/amd_utils/env.sh +++ b/benchmarks/multi_node/amd_utils/env.sh @@ -149,6 +149,41 @@ else export SAFETENSORS_FAST_GPU=1 fi + # DeepSeek-V4-Pro (FP4 experts): the DSv4 SGLANG_* env block, copied verbatim + # from the validated 0610 single-node recipe (PR #1701, benchmarks/single_node/ + # fixed_seq_len/dsv4_fp4_mi355x_sglang.sh). 
That PR realigned DSv4 to the + # mainline ...mi35x-20260610 image (now that DSv4 support is on sglang main): + # the dsv4 attention backend, unified_kv_triton FlashMLA, the aiter indexer + # (not tilelang), and the mainline fp8 wo_a / topk-v2 fallbacks hardcoded + # (SGLANG_OPT_FP8_WO_A_GEMM=false, SGLANG_OPT_USE_TOPK_V2=false) instead of a + # deep_gemm-presence detect. Branch-only FP4 MoE flags (SGLANG_DSV4_FP4_EXPERTS, + # SGLANG_FORCE_TRITON_MOE_FP8) are dropped — DSv4 main no longer needs them. + # Gated on MODEL_NAME so other models are unaffected. + if [[ "$MODEL_NAME" == "DeepSeek-V4-Pro" ]]; then + export SGLANG_DEFAULT_THINKING=1 + export SGLANG_DSV4_REASONING_EFFORT=max + export SGLANG_OPT_DEEPGEMM_HC_PRENORM=false + export SGLANG_USE_AITER=1 + export SGLANG_USE_ROCM700A=0 + export SGLANG_OPT_USE_FUSED_COMPRESS=true + export SGLANG_HACK_FLASHMLA_BACKEND=unified_kv_triton + export SGLANG_OPT_FP8_WO_A_GEMM=false + export SGLANG_OPT_USE_JIT_INDEXER_METADATA=false + export SGLANG_OPT_USE_TOPK_V2=false + export SGLANG_OPT_USE_AITER_INDEXER=true + export SGLANG_OPT_USE_TILELANG_INDEXER=false + export SGLANG_OPT_USE_TILELANG_MHC_PRE=false + export SGLANG_OPT_USE_TILELANG_MHC_POST=false + export SGLANG_FP8_PAGED_MQA_LOGITS_TORCH=1 + export SGLANG_OPT_USE_FUSED_COMPRESS_TRITON=true + export AITER_BF16_FP8_MOE_BOUND=0 + export SGLANG_EAGER_INPUT_NO_COPY=true + + # multi-stream + export SGLANG_OPT_USE_MULTI_STREAM_OVERLAP=false + export SGLANG_ROCM_USE_MULTI_STREAM=false + fi + # Disable allocating memory in one pass export MORI_SHMEM_MODE=ISOLATION diff --git a/benchmarks/multi_node/amd_utils/models.yaml b/benchmarks/multi_node/amd_utils/models.yaml index 605a377be..bd549dd54 100644 --- a/benchmarks/multi_node/amd_utils/models.yaml +++ b/benchmarks/multi_node/amd_utils/models.yaml @@ -349,3 +349,46 @@ DeepSeek-R1-0528-MXFP4-v2: max_running_requests: 128 chunked_prefill_size: 262144 cuda_graph_bs_range: "1-128" + +# DeepSeek-V4-Pro (FP4 experts) PD-disaggregation. 
Serving flags mirror the +# validated 0610 single-node SGLang recipe (PR #1701, dsv4_fp4_mi355x_sglang.sh): +# the dsv4 attention backend, SWA, page-size 256, the deepseekv4 tool-call / +# deepseek-v4 reasoning parsers, the DSv4 thinking chat template, and +# shared-experts-fusion disabled. The matching DSv4 SGLANG_* env block (#1701) +# lives in env.sh, and the config.json model_type patch (deepseek_v4 -> +# deepseek_v3) in setup_deps.sh, both gated on MODEL_NAME == DeepSeek-V4-Pro. +# --context-length is pinned (model default is very long; would over-reserve KV); +# 9472 covers the 8k/1k smoke point. kv-cache-dtype is left at the model default +# (the #1701 single-node recipe sets none), unlike the fp8_e4m3 DeepSeek-R1 +# disagg entries. The prefill delayer (--enable-prefill-delayer) is intentionally +# not used here. +DeepSeek-V4-Pro: + base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --load-balance-method round_robin --disaggregation-transfer-backend mori --attention-backend dsv4 --swa-full-tokens-ratio 0.15 --page-size 256 --disable-shared-experts-fusion --tool-call-parser deepseekv4 --reasoning-parser deepseek-v4 --context-length 9472 --chat-template /workspace/benchmarks/single_node/chat_templates/deepseek_v4_thinking.jinja" + mtp_flags: "" + dp_flags: "--moe-a2a-backend mori --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" + prefill: + mem_fraction_static: 0.8 + disable_radix_cache: true + dp: + max_running_requests: 24 + chunked_prefill_size: "MORI_MAX_DISPATCH_TOKENS_PREFILL * PREFILL_TP_SIZE" + cuda_graph_bs: "1 2 3" + no_dp: + max_running_requests: 32 + chunked_prefill_size: 8192 + cuda_graph_bs_range: "1-32" + decode: + mem_fraction_static: 0.85 + prefill_round_robin_balance: true + dp: + max_running_requests: 4096 + chunked_prefill_size: "MORI_MAX_DISPATCH_TOKENS_DECODE * DECODE_TP_SIZE" + cuda_graph_bs_range: "1-160" + ep_only: + max_running_requests: 256 + chunked_prefill_size: 262144 + 
cuda_graph_bs_range: "1-256" + no_dp: + max_running_requests: 64 + chunked_prefill_size: 8192 + cuda_graph_bs_range: "1-64" diff --git a/benchmarks/multi_node/amd_utils/setup_deps.sh b/benchmarks/multi_node/amd_utils/setup_deps.sh index add2e3fa5..13714aa77 100644 --- a/benchmarks/multi_node/amd_utils/setup_deps.sh +++ b/benchmarks/multi_node/amd_utils/setup_deps.sh @@ -735,6 +735,57 @@ install_transformers_glm5() { _SETUP_INSTALLED+=("transformers-glm5") } +# --------------------------------------------------------------------------- +# SGLang: DeepSeek-V4-Pro config.json model_type patch. +# +# Transformers in these images doesn't recognize the `deepseek_v4` model_type, +# so AutoConfig.from_pretrained crashes before SGLang can dispatch. The +# single-node DSv4 recipes patch the HF-cache config.json directly; for disagg +# the weights live on shared NFS at $MODEL_DIR/$MODEL_NAME, so patch that +# config.json instead. Set model_type -> deepseek_v3 (so AutoConfig succeeds) +# while keeping architectures=['DeepseekV4ForCausalLM'] so SGLang still +# dispatches to its native DSv4 model class. +# +# Idempotent (no-op once model_type is deepseek_v3) and crash-safe under the +# concurrent multi-node start: writes a temp file in the same dir and os.replace() +# (atomic same-filesystem rename), so a reader never sees a half-written config. +# The temp file gets config.json's permission bits first (mkstemp creates it 0600). +# Only runs for MODEL_NAME == DeepSeek-V4-Pro. +# --------------------------------------------------------------------------- +patch_dsv4_config() { + if [[ "$MODEL_NAME" != "DeepSeek-V4-Pro" ]]; then + return 0 + fi + local cfg="${MODEL_DIR}/${MODEL_NAME}/config.json" + if [[ !
-f "$cfg" ]]; then + echo "[SETUP] WARN: DSv4 config.json not found at $cfg; skipping model_type patch" + return 0 + fi + python3 - "$cfg" <<'PYEOF' +import json, os, shutil, sys, tempfile +cfg = sys.argv[1] +with open(cfg) as f: + config = json.load(f) +if config.get("model_type") != "deepseek_v4": + print(f"[SETUP] DSv4 config.json already patched (model_type={config.get('model_type')!r})") + sys.exit(0) +config["model_type"] = "deepseek_v3" +d = os.path.dirname(cfg) +fd, tmp = tempfile.mkstemp(dir=d, prefix=".config.json.", suffix=".tmp") +try: + with os.fdopen(fd, "w") as f: + json.dump(config, f, indent=2) + # mkstemp creates tmp as 0600; keep config.json's original mode across the replace + shutil.copymode(cfg, tmp) + os.replace(tmp, cfg) + print(f"[SETUP] Patched {cfg}: model_type deepseek_v4 -> deepseek_v3") +except Exception: + os.path.exists(tmp) and os.remove(tmp) + raise +PYEOF + _SETUP_INSTALLED+=("dsv4-config-model-type") +} + # ============================================================================= # Run installers (engine-gated) # ============================================================================= @@ -759,6 +810,7 @@ if [[ "$ENGINE" == "vllm-disagg" ]]; then else patch_gluon_pa_mqa_logits_instr_shape install_transformers_glm5 + patch_dsv4_config fi _SETUP_END=$(date +%s) diff --git a/benchmarks/multi_node/dsv4_fp4_mi355x_sglang-disagg.sh b/benchmarks/multi_node/dsv4_fp4_mi355x_sglang-disagg.sh new file mode 100755 index 000000000..e55559519 --- /dev/null +++ b/benchmarks/multi_node/dsv4_fp4_mi355x_sglang-disagg.sh @@ -0,0 +1,91 @@ +#!/usr/bin/env bash + +# DeepSeek-V4-Pro FP4 disaggregated prefill/decode on MI355X via SGLang + MoRI. +# Thin, model-agnostic launcher (same shape as the dsr1 / qwen3.5 / glm5 +# sglang-disagg wrappers): all serving flags live in the DeepSeek-V4-Pro entry +# of amd_utils/models.yaml, DSv4-specific env + the config.json model_type patch +# live in amd_utils/env.sh + setup_deps.sh, and topology (P/D node counts, TP/EP) +# comes from amd-master.yaml.
+ +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + CONC_LIST \ + ISL \ + OSL \ + IMAGE \ + SPEC_DECODING \ + MODEL_PATH \ + PREFILL_NUM_WORKERS \ + PREFILL_TP \ + PREFILL_EP \ + PREFILL_DP_ATTN \ + DECODE_NUM_WORKERS \ + DECODE_TP \ + DECODE_EP \ + DECODE_DP_ATTN \ + PREFILL_NODES \ + DECODE_NODES \ + RANDOM_RANGE_RATIO \ + FRAMEWORK + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +set -x + +# Use upstreamed multi_node scripts (no external clone needed) +cd "$GITHUB_WORKSPACE/benchmarks/multi_node/amd_utils" || exit 1 + +# Set up SGL launch script-specific environment variables +export TIME_LIMIT="08:00:00" +export MODEL_PATH=$MODEL_PATH +export MODEL_NAME=$MODEL_NAME +export CONTAINER_IMAGE=$IMAGE + +if [[ "${PREFILL_EP:-1}" -eq 1 ]]; then +export PREFILL_ENABLE_EP=false +else +export PREFILL_ENABLE_EP=true +fi + +if [[ "$PREFILL_DP_ATTN" == "true" ]]; then +export PREFILL_ENABLE_DP=true +else +export PREFILL_ENABLE_DP=false +fi + +if [[ "${DECODE_EP:-1}" -eq 1 ]]; then +export DECODE_ENABLE_EP=false +else +export DECODE_ENABLE_EP=true +fi + +if [[ "$DECODE_DP_ATTN" == "true" ]]; then +export DECODE_ENABLE_DP=true +else +export DECODE_ENABLE_DP=false +fi + +# Launch jobs based on ISL/OSL +# Replace ' ' in CONC_LIST with 'x' such that the concurrency list is represented +# by a list of numbers delimited by 'x'. This is because of how the underlying launch script +# expects the concurrencies. NODE_LIST is left unquoted so an empty value adds no argument. +JOB_ID=$(bash ./submit.sh "$PREFILL_NODES" \ + "$PREFILL_NUM_WORKERS" \ + "$DECODE_NODES" \ + "$DECODE_NUM_WORKERS" \ + "$ISL" "$OSL" "${CONC_LIST// /x}" inf \ + "${PREFILL_ENABLE_EP}" "${PREFILL_ENABLE_DP}" \ + "${DECODE_ENABLE_EP}" "${DECODE_ENABLE_DP}" \ + "${PREFILL_TP}" "${DECODE_TP}" \ + "${RANDOM_RANGE_RATIO}" \ + ${NODE_LIST:-}) + +if [[ $?
-ne 0 ]]; then + echo "Failed to submit job" >&2 + exit 1 +fi + +echo "$JOB_ID" diff --git a/perf-changelog.yaml b/perf-changelog.yaml index e3fb6e94f..4cc6b5e7c 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3547,3 +3547,13 @@ - "MI355x DSR1-FP4: Include TP4 configurations for 8k1k" - "Expand the TP sweep (included TP=4) for 8k/1k configuration for conc=4 to 64" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1692 + +- config-keys: + - dsv4-fp4-mi355x-sglang-disagg + description: + - "New recipe: DeepSeek-V4-Pro FP4 prefill/decode-disaggregated on MI355X via SGLang + MoRI. Combines the validated single-node DSv4 SGLang recipe with the sglang-disagg framework from the dsr1/qwen3.5/glm5 mi355x recipes" + - "New benchmarks/multi_node/dsv4_fp4_mi355x_sglang-disagg.sh launcher + DeepSeek-V4-Pro entry in amd_utils/models.yaml. Serving flags + env mirror the validated 0610 single-node recipe (PR #1701): dsv4 attention backend, SWA, page-size 256, deepseekv4/deepseek-v4 parsers, DSv4 thinking chat template, shared-experts-fusion off, context-length pinned, kv-cache-dtype at model default; prefill delayer not used" + - "DSv4 SGLANG_* env block from PR #1701 added to amd_utils/env.sh (unified_kv_triton FlashMLA, aiter indexer, mainline fp8 wo_a / topk-v2 fallbacks hardcoded, multi-stream off; branch-only SGLANG_DSV4_FP4_EXPERTS / SGLANG_FORCE_TRITON_MOE_FP8 dropped); idempotent atomic config.json model_type patch (deepseek_v4->deepseek_v3) added to amd_utils/setup_deps.sh, both gated on MODEL_NAME" + - "Image lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260610 (the validated PR #1701 0610 image; mainline, carries DSv4 support + MoRI disagg backend, auto-applies the MoRI conn.py overlay; MoRI support on the rocm/sgl-dev:*-DSv4 branch image is unverified)" + - "1P1D TP8/EP1 dp-attn false. Starts at a single ISL/OSL (8k/1k) conc=1 as an end-to-end smoke test before expanding the conc / DEP sweep" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1708