From c444e42d40adb1599916211e95f88bf86664bdae Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Mon, 8 Jun 2026 16:52:09 +0800 Subject: [PATCH 01/21] dsv4-fp4-b300-sglang: align env vars to GB300 and add fp4-indexer flag --- .../fixed_seq_len/dsv4_fp4_b300_sglang.sh | 42 ++++++++++++------- 1 file changed, 27 insertions(+), 15 deletions(-) diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh index b451dee0d..2079d4165 100755 --- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh +++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh @@ -30,8 +30,11 @@ fi nvidia-smi -# ─── Common env vars (all profiles) ─────────────────────────────────────────── -export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 +# ─── Common env vars (all profiles, GB300-aligned) ────────────────────────── +export SGLANG_JIT_DEEPGEMM_FAST_WARMUP=1 +export SGLANG_RADIX_FORCE_MISS=1 +export SGLANG_DEFAULT_THINKING=1 +export SGLANG_DSV4_REASONING_EFFORT=max export SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT=1 SERVER_LOG="$PWD/server.log" @@ -46,9 +49,25 @@ fi start_gpu_monitor --output "$PWD/gpu_metrics.csv" +# ─── DP-attention env vars (GB300-aligned) ─────────────────────────────────── +# Shared across all DP-attention profiles (conc >= 512). Set before per-conc +# tuning so individual blocks only carry NVSHMEM / batch-size overrides. +if [ "$CONC" != "1" ] && [ "$CONC" != "32" ]; then + export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1 + export SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW=1 + export SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_FP4_ACTS=1 + export SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_MXF4_KIND=1 + export SGLANG_OPT_USE_ONLINE_COMPRESS=1 + export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8192 + export SGLANG_LOG_FORWARD_ITERS=1 + export SGLANG_LOG_MS=1 + export SGLANG_REQUEST_STATE_WAIT_TIMEOUT=60 + export SGLANG_CLIP_MAX_NEW_TOKENS_ESTIMATION=8 +fi + # ─── Per-concurrency launch profile ────────────────────────────────────────── # Each block sets: PARALLEL_ARGS, MEM_FRACTION_STATIC, SWA_FULL_TOKENS_RATIO, -# and optionally MAX_RUNNING_REQUESTS plus profile-specific env vars. +# and optionally MAX_RUNNING_REQUESTS. # # SWA ratio: 1k inputs need more SWA cache headroom than 8k inputs; 0.5 was # tuned empirically for the 1k1k recipe, while 0.1 is the cookbook default. @@ -61,11 +80,11 @@ if [ "$CONC" = "1" ] || [ "$CONC" = "32" ]; then --moe-runner-backend flashinfer_mxfp4 --chunked-prefill-size 8192 --disable-flashinfer-autotune + --enable-deepseek-v4-fp4-indexer ) elif [ "$CONC" = "512" ]; then # DP attention, flashinfer_mxfp4 - export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1 MEM_FRACTION_STATIC=0.94 SWA_FULL_TOKENS_RATIO=$([[ "$ISL" == "1024" ]] && echo 0.5 || echo 0.1) PARALLEL_ARGS=( @@ -75,15 +94,12 @@ elif [ "$CONC" = "512" ]; then --disable-flashinfer-autotune --chunked-prefill-size 16384 --enable-prefill-delayer + --enable-deepseek-v4-fp4-indexer ) elif [ "$CONC" = "2048" ]; then # DP attention, megamoe - export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1 export NVSHMEM_DISABLE_IB=1 - export SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW=1 - export SGLANG_LOG_FORWARD_ITERS=1 - export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8320 MEM_FRACTION_STATIC=0.87 SWA_FULL_TOKENS_RATIO=0.06 MAX_RUNNING_REQUESTS=2560 @@ -95,14 +111,12 @@ elif [ "$CONC" = "2048" ]; then --chunked-prefill-size 65536 --tokenizer-worker-num 4 --enable-prefill-delayer + --enable-deepseek-v4-fp4-indexer ) elif [ "$CONC" = "4096" ]; then # DP attention, megamoe - export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1 export NVSHMEM_DISABLE_IB=1 - export SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW=1 - export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8320 MEM_FRACTION_STATIC=0.835 SWA_FULL_TOKENS_RATIO=0.075 MAX_RUNNING_REQUESTS=4352 @@ -115,15 +129,12 @@ elif [ "$CONC" = "4096" ]; then --tokenizer-worker-num 8 --enable-prefill-delayer --decode-log-interval 5 + --enable-deepseek-v4-fp4-indexer ) elif [ "$CONC" = "8192" ]; then # DP attention, megamoe - export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1 export NVSHMEM_DISABLE_IB=1 - export SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW=1 - export SGLANG_OPT_USE_ONLINE_COMPRESS=1 - export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8256 MEM_FRACTION_STATIC=0.80 SWA_FULL_TOKENS_RATIO=0.3 MAX_RUNNING_REQUESTS=8192 @@ -136,6 +147,7 @@ elif [ "$CONC" = "8192" ]; then --tokenizer-worker-num 16 --enable-prefill-delayer --stream-interval 30 + --enable-deepseek-v4-fp4-indexer ) else From 3ab8dc9286c8f4eb224802cca8602a73f93eec9f Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Mon, 8 Jun 2026 16:54:25 +0800 Subject: [PATCH 02/21] dsv4-fp4-b300-sglang: bump image to nightly-dev-cu13-20260608-303757cc --- .github/configs/nvidia-master.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index a02749d4d..f6374402c 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -2003,7 +2003,7 @@ dsr1-fp8-b300-sglang: # DeepSeek-V4-Pro on B300 with sglang (non-MTP). # Uses nightly image with megamoe backend for high-concurrency profiles. dsv4-fp4-b300-sglang: - image: lmsysorg/sglang:nightly-dev-cu13-20260529-a8cfae0b + image: lmsysorg/sglang:nightly-dev-cu13-20260608-303757cc model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b300 From 250358c5189415d045e62aa3fc599d15493fd651 Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Mon, 8 Jun 2026 17:25:10 +0800 Subject: [PATCH 03/21] Add perf-changelog entry for dsv4-fp4-b300-sglang env var alignment --- perf-changelog.yaml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 5622173f1..a4eebe53d 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3531,3 +3531,11 @@ - "The Rust frontend replaces only the Python serving/API layer (HTTP, tokenization, scheduling glue, detokenization) and spawns the same Python EngineCore, so GPU kernels/attention/MoE GEMM/KV cache are untouched" - "A/B sweep (28 single-node points, 1k1k + 8k1k, TP 1/2/4) vs the Python-frontend baseline (run 26696260751): throughput Pareto-neutral (peak tok/s/GPU within <1.5%, frontiers coincident) and TPOT flat (+-0.5%); TTFT improves ~8% at 1k1k and ~22% at 8k1k (every point), the expected signature of lower frontend CPU latency before first token, scaling with input length" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1634 + +- config-keys: + - dsv4-fp4-b300-sglang + description: + - "Align env vars to GB300: replace PRECOMPILE=0 with FAST_WARMUP=1, add RADIX_FORCE_MISS, DEFAULT_THINKING, DSV4_REASONING_EFFORT=max" + - "Add shared DP-attention env vars: MEGA_MOE_USE_FP4_ACTS, USE_MXF4_KIND, USE_ONLINE_COMPRESS, unified NUM_MAX_TOKENS_PER_RANK=8192" + - "Bump image to nightly-dev-cu13-20260601-373cadc9" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1682 From 96b2462b3ed3996f5128b4e4c0e03936377718df Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Tue, 9 Jun 2026 00:39:34 +0800 Subject: [PATCH 04/21] dsv4-fp4-b300-sglang: switch to nightly-dev-cu13-20260606-b3e4c204 --- .github/configs/nvidia-master.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index f6374402c..51ea64ff4 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -2003,7 +2003,7 @@ dsr1-fp8-b300-sglang: # DeepSeek-V4-Pro on B300 with sglang (non-MTP). # Uses nightly image with megamoe backend for high-concurrency profiles. dsv4-fp4-b300-sglang: - image: lmsysorg/sglang:nightly-dev-cu13-20260608-303757cc + image: lmsysorg/sglang:nightly-dev-cu13-20260606-b3e4c204 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b300 From 9d8e10dff591d9dfd3c1bcb17632563eedbb73a8 Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Tue, 9 Jun 2026 01:19:51 +0800 Subject: [PATCH 05/21] dsv4-fp4-b300-sglang: switch to nightly-dev-cu13-20260604-14ed9b44 --- .github/configs/nvidia-master.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 51ea64ff4..bc6b5effb 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -2003,7 +2003,7 @@ dsr1-fp8-b300-sglang: # DeepSeek-V4-Pro on B300 with sglang (non-MTP). # Uses nightly image with megamoe backend for high-concurrency profiles. dsv4-fp4-b300-sglang: - image: lmsysorg/sglang:nightly-dev-cu13-20260606-b3e4c204 + image: lmsysorg/sglang:nightly-dev-cu13-20260604-14ed9b44 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b300 From 50d1c91fdc6653a16a779dc8cdefe3a8fdc70c28 Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Tue, 9 Jun 2026 02:00:01 +0800 Subject: [PATCH 06/21] dsv4-fp4-b300-sglang: switch to nightly-dev-cu13-20260601-373cadc9 --- .github/configs/nvidia-master.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index bc6b5effb..d204aa7c1 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -2003,7 +2003,7 @@ dsr1-fp8-b300-sglang: # DeepSeek-V4-Pro on B300 with sglang (non-MTP). # Uses nightly image with megamoe backend for high-concurrency profiles. dsv4-fp4-b300-sglang: - image: lmsysorg/sglang:nightly-dev-cu13-20260604-14ed9b44 + image: lmsysorg/sglang:nightly-dev-cu13-20260601-373cadc9 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b300 From 1025c7fabbba0898a1445ed056ab7915cf4ed49d Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Tue, 9 Jun 2026 07:03:48 +0800 Subject: [PATCH 07/21] dsv4-fp4-b300-sglang: remove --enable-deepseek-v4-fp4-indexer --- .../single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh index 2079d4165..7295f353c 100755 --- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh +++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh @@ -80,7 +80,7 @@ if [ "$CONC" = "1" ] || [ "$CONC" = "32" ]; then --moe-runner-backend flashinfer_mxfp4 --chunked-prefill-size 8192 --disable-flashinfer-autotune - --enable-deepseek-v4-fp4-indexer + ) elif [ "$CONC" = "512" ]; then @@ -94,7 +94,7 @@ elif [ "$CONC" = "512" ]; then --disable-flashinfer-autotune --chunked-prefill-size 16384 --enable-prefill-delayer - --enable-deepseek-v4-fp4-indexer + ) elif [ "$CONC" = "2048" ]; then @@ -111,7 +111,7 @@ elif [ "$CONC" = "2048" ]; then --chunked-prefill-size 65536 --tokenizer-worker-num 4 --enable-prefill-delayer - --enable-deepseek-v4-fp4-indexer + ) elif [ "$CONC" = "4096" ]; then @@ -129,7 +129,7 @@ elif [ "$CONC" = "4096" ]; then --tokenizer-worker-num 8 --enable-prefill-delayer --decode-log-interval 5 - --enable-deepseek-v4-fp4-indexer + ) elif [ "$CONC" = "8192" ]; then @@ -147,7 +147,7 @@ elif [ "$CONC" = "8192" ]; then --tokenizer-worker-num 16 --enable-prefill-delayer --stream-interval 30 - --enable-deepseek-v4-fp4-indexer + ) else From 0e02411b00e5a7307aeea69b09ba9bd9a7e1b286 Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Tue, 9 Jun 2026 07:40:36 +0800 Subject: [PATCH 08/21] dsv4-fp4-b300-sglang: revert image to original nightly-dev-cu13-20260529-a8cfae0b --- .github/configs/nvidia-master.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index d204aa7c1..a02749d4d 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -2003,7 +2003,7 @@ dsr1-fp8-b300-sglang: # DeepSeek-V4-Pro on B300 with sglang (non-MTP). # Uses nightly image with megamoe backend for high-concurrency profiles. dsv4-fp4-b300-sglang: - image: lmsysorg/sglang:nightly-dev-cu13-20260601-373cadc9 + image: lmsysorg/sglang:nightly-dev-cu13-20260529-a8cfae0b model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b300 From c65520a787a92d00a7e068e40d8fd78302103f8f Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Tue, 9 Jun 2026 11:13:26 +0800 Subject: [PATCH 09/21] dsv4-fp4-b300-sglang: switch to nightly-dev-cu13-20260609-317fc6a9 --- .github/configs/nvidia-master.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index a02749d4d..56b11c889 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -2003,7 +2003,7 @@ dsr1-fp8-b300-sglang: # DeepSeek-V4-Pro on B300 with sglang (non-MTP). # Uses nightly image with megamoe backend for high-concurrency profiles. dsv4-fp4-b300-sglang: - image: lmsysorg/sglang:nightly-dev-cu13-20260529-a8cfae0b + image: lmsysorg/sglang:nightly-dev-cu13-20260609-317fc6a9 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b300 From 6beee8c23f232ef6a6a2ef2fe719ef081afd9773 Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Tue, 9 Jun 2026 14:06:03 +0800 Subject: [PATCH 10/21] dsv4-fp4-b300-sglang: kill stale processes on server ports before launch --- benchmarks/benchmark_lib.sh | 54 +++++++++++++++++++ .../fixed_seq_len/dsv4_fp4_b300_sglang.sh | 2 + 2 files changed, 56 insertions(+) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index e3080b4bf..95c8d7217 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -78,6 +78,60 @@ stop_gpu_monitor() { # Check if required environment variables are set # Usage: check_env_vars VAR1 VAR2 VAR3 ... # Exits with code 1 if any variable is not set +# Kill any process listening on the given TCP port. +# Usage: kill_port_users PORT [PORT ...] +# Silently succeeds if the port is already free. +kill_port_users() { + for port in "$@"; do + local pids + # lsof is the most portable way; fall back to fuser / ss+awk. + if command -v lsof >/dev/null 2>&1; then + pids=$(lsof -ti "tcp:$port" 2>/dev/null || true) + elif command -v fuser >/dev/null 2>&1; then + pids=$(fuser "$port/tcp" 2>/dev/null | tr -s ' ' '\n' || true) + elif command -v ss >/dev/null 2>&1; then + pids=$(ss -tlnp "sport = :$port" 2>/dev/null \ + | awk -F'pid=' 'NR>1 && $2{split($2,a,","); print a[1]}' || true) + fi + + pids=$(echo "$pids" | xargs) # trim whitespace + if [[ -z "$pids" ]]; then + continue + fi + + echo "[cleanup] Killing process(es) on port $port: $pids" + # SIGTERM first, give 3s, then SIGKILL + kill $pids 2>/dev/null || true + for _i in $(seq 1 6); do + if ! lsof -ti "tcp:$port" >/dev/null 2>&1 && \ + ! fuser "$port/tcp" >/dev/null 2>&1; then + break + fi + sleep 0.5 + done + kill -9 $pids 2>/dev/null || true + done +} + +# Kill stale processes on the server port and the range of ZMQ/NCCL ports +# that sglang derives from it (port+233 … port+239). Call before launching +# the inference server to avoid "port not available" errors from leftover +# processes of a prior run. +# Usage: cleanup_server_ports [PORT] (defaults to $PORT / 8888) +cleanup_server_ports() { + local base="${1:-${PORT:-8888}}" + local zmq_base=$((base + 234)) # port + ZMQ_TCP_PORT_DELTA(233) + 1 + local ports=("$base") + for offset in $(seq 0 5); do + ports+=($((zmq_base + offset))) + done + # Also kill the NCCL port range (base + 234 + 100 … +108 for tp=8) + for offset in $(seq 100 108); do + ports+=($((zmq_base + offset))) + done + kill_port_users "${ports[@]}" +} + check_env_vars() { local missing_vars=() diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh index 7295f353c..447cef0c6 100755 --- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh +++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh @@ -163,6 +163,8 @@ fi echo "===================================" } | tee "$SERVER_LOG" +cleanup_server_ports + set -x PYTHONNOUSERSITE=1 sglang serve \ --model-path $MODEL_PATH --served-model-name $MODEL \ From 5bd18adc9f982b19b2b985a5b4b6d2e0e3d96c16 Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Tue, 9 Jun 2026 14:30:32 +0800 Subject: [PATCH 11/21] benchmark: kill stale server processes before launch DP-attention mode uses deterministic TCP ports derived from $PORT. If a previous sglang process didn't exit cleanly (e.g. Slurm scancel didn't fully propagate SIGTERM to all subprocesses), the next launch fails with "metrics_port at 9125 is not available". Non-DP-attention mode is unaffected because it uses ipc:// with random temp paths. Add kill_stale_servers() to benchmark_lib.sh and call it from all sglang benchmark scripts that use DP-attention (B300 and B200). --- benchmarks/benchmark_lib.sh | 22 +++++++++++++++++++ .../fixed_seq_len/dsv4_fp4_b200.sh | 2 ++ .../fixed_seq_len/dsv4_fp4_b300_sglang.sh | 2 +- .../fixed_seq_len/dsv4_fp4_b300_sglang_mtp.sh | 2 ++ 4 files changed, 27 insertions(+), 1 deletion(-) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 95c8d7217..f6509c865 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -16,6 +16,28 @@ mkdir -p "$PYTHONPYCACHEPREFIX" 2>/dev/null || true # nothing upstream set it. export PORT="${PORT:-8888}" +# -------------------------------- +# Stale server cleanup +# -------------------------------- +# Kill leftover inference-server processes from a prior run on this node. +# DP-attention mode derives deterministic TCP ports from $PORT, so a stale +# sglang process that didn't die cleanly will block the next launch. +# This runs on the compute node (inside srun), where the processes live. +kill_stale_servers() { + echo "[Cleanup] Killing stale inference-server processes ..." + # Kill by port: main server port and DP-attention ZMQ ports (port+234..port+238) + local _port + for _port in "$PORT" $(seq $((PORT + 234)) $((PORT + 238))); do + fuser -k "$_port/tcp" 2>/dev/null || true + done + # Belt-and-suspenders: kill any remaining sglang/vllm serve processes + pkill -9 -f "sglang serve" 2>/dev/null || true + pkill -9 -f "sglang.srt" 2>/dev/null || true + pkill -9 -f "vllm serve" 2>/dev/null || true + sleep 2 + echo "[Cleanup] Done." +} + # -------------------------------- # GPU monitoring helpers # -------------------------------- diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200.sh index e1d031854..6c0fee403 100755 --- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200.sh +++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200.sh @@ -82,6 +82,8 @@ fi echo "===================================" } | tee "$SERVER_LOG" +kill_stale_servers + set -x PYTHONNOUSERSITE=1 sglang serve \ --model-path $MODEL \ diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh index 447cef0c6..042602074 100755 --- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh +++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh @@ -163,7 +163,7 @@ fi echo "===================================" } | tee "$SERVER_LOG" -cleanup_server_ports +kill_stale_servers set -x PYTHONNOUSERSITE=1 sglang serve \ diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang_mtp.sh index 672d48f4b..f2c25148b 100755 --- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang_mtp.sh @@ -124,6 +124,8 @@ fi echo "===================================" } | tee "$SERVER_LOG" +kill_stale_servers + set -x PYTHONNOUSERSITE=1 sglang serve \ --model-path $MODEL_PATH --served-model-name $MODEL \ From c27f53faefe81ea1db4cc7dafd25d1f8b37c23ff Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Tue, 9 Jun 2026 16:28:40 +0800 Subject: [PATCH 12/21] benchmark: use python3+psutil to kill stale server by port fuser/pkill may not be available in all containers. Use python3+psutil (always present in sglang/vllm images) to precisely kill processes on $PORT and $PORT+237 (the DP-attention metrics_port = PORT + ZMQ_TCP_PORT_DELTA + 4). --- benchmarks/benchmark_lib.sh | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index f6509c865..e44c19f9f 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -20,20 +20,27 @@ export PORT="${PORT:-8888}" # Stale server cleanup # -------------------------------- # Kill leftover inference-server processes from a prior run on this node. -# DP-attention mode derives deterministic TCP ports from $PORT, so a stale -# sglang process that didn't die cleanly will block the next launch. -# This runs on the compute node (inside srun), where the processes live. +# DP-attention mode derives deterministic TCP ports from $PORT: +# metrics_port = PORT + 233 (ZMQ_TCP_PORT_DELTA) + 1 + 3 = PORT + 237 +# A stale sglang process holding that port blocks the next launch. kill_stale_servers() { echo "[Cleanup] Killing stale inference-server processes ..." - # Kill by port: main server port and DP-attention ZMQ ports (port+234..port+238) - local _port - for _port in "$PORT" $(seq $((PORT + 234)) $((PORT + 238))); do - fuser -k "$_port/tcp" 2>/dev/null || true - done - # Belt-and-suspenders: kill any remaining sglang/vllm serve processes - pkill -9 -f "sglang serve" 2>/dev/null || true - pkill -9 -f "sglang.srt" 2>/dev/null || true - pkill -9 -f "vllm serve" 2>/dev/null || true + # Use python3+psutil (always present in sglang/vllm images) to find and + # kill any process listening on our server port or the derived metrics port. + python3 -c " +import os, signal, psutil +targets = {int(os.environ.get('PORT', 8888))} +targets.add(min(targets) + 237) # metrics_port +killed = set() +for c in psutil.net_connections('inet'): + if c.laddr.port in targets and c.pid and c.pid not in killed: + try: + os.kill(c.pid, signal.SIGKILL) + killed.add(c.pid) + print(f' killed pid {c.pid} (port {c.laddr.port})') + except OSError: + pass +" 2>/dev/null || true sleep 2 echo "[Cleanup] Done." } From cd2f55f712585015822903f9fe7b11fbe79f6fe2 Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Tue, 9 Jun 2026 16:56:42 +0800 Subject: [PATCH 13/21] fix stale process cleanup: run pkill on host, not inside container The previous kill_stale_servers ran inside the Pyxis container, which cannot see or kill processes in the host PID namespace. Move the cleanup to launch_b300-nv.sh as a bare srun (no --container-image) so pkill runs directly on the compute node and can kill leftover sglang/vllm processes from prior container runs. --- benchmarks/benchmark_lib.sh | 29 ------------------- .../fixed_seq_len/dsv4_fp4_b200.sh | 2 -- .../fixed_seq_len/dsv4_fp4_b300_sglang.sh | 2 -- .../fixed_seq_len/dsv4_fp4_b300_sglang_mtp.sh | 2 -- runners/launch_b300-nv.sh | 7 +++++ 5 files changed, 7 insertions(+), 35 deletions(-) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index e44c19f9f..95c8d7217 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -16,35 +16,6 @@ mkdir -p "$PYTHONPYCACHEPREFIX" 2>/dev/null || true # nothing upstream set it. export PORT="${PORT:-8888}" -# -------------------------------- -# Stale server cleanup -# -------------------------------- -# Kill leftover inference-server processes from a prior run on this node. -# DP-attention mode derives deterministic TCP ports from $PORT: -# metrics_port = PORT + 233 (ZMQ_TCP_PORT_DELTA) + 1 + 3 = PORT + 237 -# A stale sglang process holding that port blocks the next launch. -kill_stale_servers() { - echo "[Cleanup] Killing stale inference-server processes ..." - # Use python3+psutil (always present in sglang/vllm images) to find and - # kill any process listening on our server port or the derived metrics port. - python3 -c " -import os, signal, psutil -targets = {int(os.environ.get('PORT', 8888))} -targets.add(min(targets) + 237) # metrics_port -killed = set() -for c in psutil.net_connections('inet'): - if c.laddr.port in targets and c.pid and c.pid not in killed: - try: - os.kill(c.pid, signal.SIGKILL) - killed.add(c.pid) - print(f' killed pid {c.pid} (port {c.laddr.port})') - except OSError: - pass -" 2>/dev/null || true - sleep 2 - echo "[Cleanup] Done." -} - # -------------------------------- # GPU monitoring helpers # -------------------------------- diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200.sh index 6c0fee403..e1d031854 100755 --- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200.sh +++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200.sh @@ -82,8 +82,6 @@ fi echo "===================================" } | tee "$SERVER_LOG" -kill_stale_servers - set -x PYTHONNOUSERSITE=1 sglang serve \ --model-path $MODEL \ diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh index 042602074..7295f353c 100755 --- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh +++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh @@ -163,8 +163,6 @@ fi echo "===================================" } | tee "$SERVER_LOG" -kill_stale_servers - set -x PYTHONNOUSERSITE=1 sglang serve \ --model-path $MODEL_PATH --served-model-name $MODEL \ diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang_mtp.sh index f2c25148b..672d48f4b 100755 --- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang_mtp.sh @@ -124,8 +124,6 @@ fi echo "===================================" } | tee "$SERVER_LOG" -kill_stale_servers - set -x PYTHONNOUSERSITE=1 sglang serve \ --model-path $MODEL_PATH --served-model-name $MODEL \ diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh index fc0ac297f..f60dae907 100644 --- a/runners/launch_b300-nv.sh +++ b/runners/launch_b300-nv.sh @@ -408,6 +408,13 @@ else salloc --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT -N 1 --gres=gpu:$TP --exclusive --time="${SALLOC_TIME_LIMIT:-180}" --no-shell --job-name="$RUNNER_NAME" JOB_ID=$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1) + # Kill stale sglang/vllm processes on the compute node (outside container). + # DP-attention uses deterministic TCP ports; a leftover process blocks the + # next launch with "metrics_port not available". + srun --jobid=$JOB_ID --mpi=none \ + bash -c 'echo "[Host cleanup] killing stale inference processes ..."; pkill -9 -f "sglang" 2>/dev/null; pkill -9 -f "vllm" 2>/dev/null; sleep 2; echo "[Host cleanup] done."' \ + || true + srun --jobid=$JOB_ID \ --mpi=none \ --container-image=$SQUASH_FILE \ From 276647bd63c1a775d52757c131701b2e3065005d Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Tue, 9 Jun 2026 17:10:28 +0800 Subject: [PATCH 14/21] b300: fix pkill self-kill and change PORT to 30000 Two fixes: - pkill -f "sglang" matched its own bash -c argv and killed itself before reaching the stale process. Use [s]glang regex trick so pkill skips its own process. - Change PORT from 8888 to 30000 to avoid conflict with stale processes still bound to 8888-derived ports (metrics_port 9125). --- runners/launch_b300-nv.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh index f60dae907..e1088d8be 100644 --- a/runners/launch_b300-nv.sh +++ b/runners/launch_b300-nv.sh @@ -411,8 +411,10 @@ else # Kill stale sglang/vllm processes on the compute node (outside container). # DP-attention uses deterministic TCP ports; a leftover process blocks the # next launch with "metrics_port not available". + # [s]glang trick: the regex [s]glang matches "sglang" but not the literal + # "[s]glang" in pkill's own argv, preventing pkill from killing itself. srun --jobid=$JOB_ID --mpi=none \ - bash -c 'echo "[Host cleanup] killing stale inference processes ..."; pkill -9 -f "sglang" 2>/dev/null; pkill -9 -f "vllm" 2>/dev/null; sleep 2; echo "[Host cleanup] done."' \ + bash -c 'echo "[Host cleanup] killing stale inference processes ..."; pkill -9 -f "[s]glang" 2>/dev/null; pkill -9 -f "[v]llm" 2>/dev/null; sleep 2; echo "[Host cleanup] done."' \ || true srun --jobid=$JOB_ID \ @@ -421,7 +423,7 @@ else --container-mounts=$GITHUB_WORKSPACE:$CONTAINER_MOUNT_DIR,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE_MOUNT,$WRITABLE_MODELS_DIR:$WRITABLE_MODELS_DIR \ --no-container-mount-home \ --container-workdir=$CONTAINER_MOUNT_DIR \ - --no-container-entrypoint --export=ALL,PORT=8888 \ + --no-container-entrypoint --export=ALL,PORT=30000 \ bash "$BENCH_SCRIPT" fi From 27d5fe9ebfb886121d0bff8491cbe005f08bf902 Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Tue, 9 Jun 2026 19:14:37 +0800 Subject: [PATCH 15/21] dsv4-fp4-b300-sglang: remove kill logic, add --enable-deepseek-v4-fp4-indexer --- .../single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh | 10 +++++----- runners/launch_b300-nv.sh | 9 --------- 2 files changed, 5 insertions(+), 14 deletions(-) diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh index 7295f353c..2079d4165 100755 --- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh +++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh @@ -80,7 +80,7 @@ if [ "$CONC" = "1" ] || [ "$CONC" = "32" ]; then --moe-runner-backend flashinfer_mxfp4 --chunked-prefill-size 8192 --disable-flashinfer-autotune - + --enable-deepseek-v4-fp4-indexer ) elif [ "$CONC" = "512" ]; then @@ -94,7 +94,7 @@ elif [ "$CONC" = "512" ]; then --disable-flashinfer-autotune --chunked-prefill-size 16384 --enable-prefill-delayer - + --enable-deepseek-v4-fp4-indexer ) elif [ "$CONC" = "2048" ]; then @@ -111,7 +111,7 @@ elif [ "$CONC" = "2048" ]; then --chunked-prefill-size 65536 --tokenizer-worker-num 4 --enable-prefill-delayer - + --enable-deepseek-v4-fp4-indexer ) elif [ "$CONC" = "4096" ]; then @@ -129,7 +129,7 @@ elif [ "$CONC" = "4096" ]; then --tokenizer-worker-num 8 --enable-prefill-delayer --decode-log-interval 5 - + --enable-deepseek-v4-fp4-indexer ) elif [ "$CONC" = "8192" ]; then @@ -147,7 +147,7 @@ elif [ "$CONC" = "8192" ]; then --tokenizer-worker-num 16 --enable-prefill-delayer --stream-interval 30 - + --enable-deepseek-v4-fp4-indexer ) else diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh index e1088d8be..665160fd3 100644 --- a/runners/launch_b300-nv.sh +++ b/runners/launch_b300-nv.sh @@ -408,15 +408,6 @@ else salloc --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT -N 1 --gres=gpu:$TP --exclusive --time="${SALLOC_TIME_LIMIT:-180}" --no-shell --job-name="$RUNNER_NAME" JOB_ID=$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1) - # Kill stale sglang/vllm processes on the compute node (outside container). - # DP-attention uses deterministic TCP ports; a leftover process blocks the - # next launch with "metrics_port not available". - # [s]glang trick: the regex [s]glang matches "sglang" but not the literal - # "[s]glang" in pkill's own argv, preventing pkill from killing itself. - srun --jobid=$JOB_ID --mpi=none \ - bash -c 'echo "[Host cleanup] killing stale inference processes ..."; pkill -9 -f "[s]glang" 2>/dev/null; pkill -9 -f "[v]llm" 2>/dev/null; sleep 2; echo "[Host cleanup] done."' \ - || true - srun --jobid=$JOB_ID \ --mpi=none \ --container-image=$SQUASH_FILE \ From 30211e40d6f1cf1303eb0ad339ce052627fbd2cb Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Tue, 9 Jun 2026 23:42:46 +0800 Subject: [PATCH 16/21] dsv4-fp4-b300-sglang: add --enforce-piecewise-cuda-graph --enable-mixed-chunk --- .../single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh index 2079d4165..6a56d9fc2 100755 --- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh +++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh @@ -81,6 +81,8 @@ if [ "$CONC" = "1" ] || [ "$CONC" = "32" ]; then --chunked-prefill-size 8192 --disable-flashinfer-autotune --enable-deepseek-v4-fp4-indexer + --enforce-piecewise-cuda-graph + --enable-mixed-chunk ) elif [ "$CONC" = "512" ]; then @@ -95,6 +97,8 @@ elif [ "$CONC" = "512" ]; then --chunked-prefill-size 16384 --enable-prefill-delayer --enable-deepseek-v4-fp4-indexer + --enforce-piecewise-cuda-graph + --enable-mixed-chunk ) elif [ "$CONC" = "2048" ]; then @@ -112,6 +116,8 @@ elif [ "$CONC" = "2048" ]; then --tokenizer-worker-num 4 --enable-prefill-delayer --enable-deepseek-v4-fp4-indexer + --enforce-piecewise-cuda-graph + --enable-mixed-chunk ) elif [ "$CONC" = "4096" ]; then @@ -130,6 +136,8 @@ elif [ "$CONC" = "4096" ]; then --enable-prefill-delayer --decode-log-interval 5 --enable-deepseek-v4-fp4-indexer + --enforce-piecewise-cuda-graph + --enable-mixed-chunk ) elif [ "$CONC" = "8192" ]; then @@ -148,6 +156,8 @@ elif [ "$CONC" = "8192" ]; then --enable-prefill-delayer --stream-interval 30 --enable-deepseek-v4-fp4-indexer + --enforce-piecewise-cuda-graph + --enable-mixed-chunk ) else From 48af571260914325a6058cbb7f5f0ada3b6bbb9a Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Tue, 9 Jun 2026 23:46:52 +0800 Subject: [PATCH 17/21] perf-changelog: add entry for piecewise cuda graph PR #1693 --- perf-changelog.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index a4eebe53d..920069783 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3539,3 +3539,9 @@ - "Add shared DP-attention env vars: MEGA_MOE_USE_FP4_ACTS, USE_MXF4_KIND, USE_ONLINE_COMPRESS, unified NUM_MAX_TOKENS_PER_RANK=8192" - "Bump image to nightly-dev-cu13-20260601-373cadc9" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1682 + +- config-keys: + - dsv4-fp4-b300-sglang + description: + - "Add --enforce-piecewise-cuda-graph and --enable-mixed-chunk to all concurrency profiles" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1693 From 38653b9be2abe00bf8a2e27dfe571ac59cfea0b2 Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Tue, 9 Jun 2026 23:49:15 +0800 Subject: [PATCH 18/21] benchmark_lib: remove stale kill_port_users/cleanup_server_ports --- benchmarks/benchmark_lib.sh | 54 ------------------------------------- 1 file changed, 54 deletions(-) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 95c8d7217..e3080b4bf 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -78,60 +78,6 @@ stop_gpu_monitor() { # Check if required environment variables are set # Usage: check_env_vars VAR1 VAR2 VAR3 ... # Exits with code 1 if any variable is not set -# Kill any process listening on the given TCP port. -# Usage: kill_port_users PORT [PORT ...] -# Silently succeeds if the port is already free. -kill_port_users() { - for port in "$@"; do - local pids - # lsof is the most portable way; fall back to fuser / ss+awk. - if command -v lsof >/dev/null 2>&1; then - pids=$(lsof -ti "tcp:$port" 2>/dev/null || true) - elif command -v fuser >/dev/null 2>&1; then - pids=$(fuser "$port/tcp" 2>/dev/null | tr -s ' ' '\n' || true) - elif command -v ss >/dev/null 2>&1; then - pids=$(ss -tlnp "sport = :$port" 2>/dev/null \ - | awk -F'pid=' 'NR>1 && $2{split($2,a,","); print a[1]}' || true) - fi - - pids=$(echo "$pids" | xargs) # trim whitespace - if [[ -z "$pids" ]]; then - continue - fi - - echo "[cleanup] Killing process(es) on port $port: $pids" - # SIGTERM first, give 3s, then SIGKILL - kill $pids 2>/dev/null || true - for _i in $(seq 1 6); do - if ! lsof -ti "tcp:$port" >/dev/null 2>&1 && \ - ! fuser "$port/tcp" >/dev/null 2>&1; then - break - fi - sleep 0.5 - done - kill -9 $pids 2>/dev/null || true - done -} - -# Kill stale processes on the server port and the range of ZMQ/NCCL ports -# that sglang derives from it (port+233 … port+239). Call before launching -# the inference server to avoid "port not available" errors from leftover -# processes of a prior run. -# Usage: cleanup_server_ports [PORT] (defaults to $PORT / 8888) -cleanup_server_ports() { - local base="${1:-${PORT:-8888}}" - local zmq_base=$((base + 234)) # port + ZMQ_TCP_PORT_DELTA(233) + 1 - local ports=("$base") - for offset in $(seq 0 5); do - ports+=($((zmq_base + offset))) - done - # Also kill the NCCL port range (base + 234 + 100 … +108 for tp=8) - for offset in $(seq 100 108); do - ports+=($((zmq_base + offset))) - done - kill_port_users "${ports[@]}" -} - check_env_vars() { local missing_vars=() From abf43499033701ca3c9bcb74a878b66ab14f6c29 Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Wed, 10 Jun 2026 00:01:23 +0800 Subject: [PATCH 19/21] dsv4-fp4-b300-sglang: replace piecewise-cuda-graph/mixed-chunk with flashinfer-allreduce-fusion --- .../fixed_seq_len/dsv4_fp4_b300_sglang.sh | 15 +++++---------- perf-changelog.yaml | 2 +- 2 files changed, 6 insertions(+), 11 deletions(-) diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh index 6a56d9fc2..fc55899df 100755 --- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh +++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh @@ -81,8 +81,7 @@ if [ "$CONC" = "1" ] || [ "$CONC" = "32" ]; then --chunked-prefill-size 8192 --disable-flashinfer-autotune --enable-deepseek-v4-fp4-indexer - --enforce-piecewise-cuda-graph - --enable-mixed-chunk + --enable-flashinfer-allreduce-fusion ) elif [ "$CONC" = "512" ]; then @@ -97,8 +96,7 @@ elif [ "$CONC" = "512" ]; then --chunked-prefill-size 16384 --enable-prefill-delayer --enable-deepseek-v4-fp4-indexer - --enforce-piecewise-cuda-graph - --enable-mixed-chunk + --enable-flashinfer-allreduce-fusion ) elif [ "$CONC" = "2048" ]; then @@ -116,8 +114,7 @@ elif [ "$CONC" = "2048" ]; then --tokenizer-worker-num 4 --enable-prefill-delayer --enable-deepseek-v4-fp4-indexer - --enforce-piecewise-cuda-graph - --enable-mixed-chunk + --enable-flashinfer-allreduce-fusion ) elif [ "$CONC" = "4096" ]; then @@ -136,8 +133,7 @@ elif [ "$CONC" = "4096" ]; then --enable-prefill-delayer --decode-log-interval 5 --enable-deepseek-v4-fp4-indexer - --enforce-piecewise-cuda-graph - --enable-mixed-chunk + --enable-flashinfer-allreduce-fusion ) elif [ "$CONC" = "8192" ]; then @@ -156,8 +152,7 @@ elif [ "$CONC" = "8192" ]; then --enable-prefill-delayer --stream-interval 30 --enable-deepseek-v4-fp4-indexer - --enforce-piecewise-cuda-graph - --enable-mixed-chunk + --enable-flashinfer-allreduce-fusion ) else diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 920069783..4b3f347d8 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3543,5 +3543,5 @@ - config-keys: - dsv4-fp4-b300-sglang description: - - "Add --enforce-piecewise-cuda-graph and --enable-mixed-chunk to all concurrency profiles" + - "Add --enable-flashinfer-allreduce-fusion to all concurrency profiles" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1693 From f2724b9ad60e597318ae1a7640ef3906c3d2e11f Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Wed, 10 Jun 2026 09:14:02 +0800 Subject: [PATCH 20/21] Replace --enable-flashinfer-allreduce-fusion with --enable-mixed-chunk, set chunked-prefill-size 16384 for high-conc profiles --- .../fixed_seq_len/dsv4_fp4_b300_sglang.sh | 16 ++++++++-------- perf-changelog.yaml | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh index fc55899df..2efe2d2e0 100755 --- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh +++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh @@ -81,7 +81,7 @@ if [ "$CONC" = "1" ] || [ "$CONC" = "32" ]; then --chunked-prefill-size 8192 --disable-flashinfer-autotune --enable-deepseek-v4-fp4-indexer - --enable-flashinfer-allreduce-fusion + --enable-mixed-chunk ) elif [ "$CONC" = "512" ]; then @@ -96,7 +96,7 @@ elif [ "$CONC" = "512" ]; then --chunked-prefill-size 16384 --enable-prefill-delayer --enable-deepseek-v4-fp4-indexer - --enable-flashinfer-allreduce-fusion + --enable-mixed-chunk ) elif [ "$CONC" = "2048" ]; then @@ -110,11 +110,11 @@ elif [ "$CONC" = "2048" ]; then --enable-dp-attention --moe-a2a-backend megamoe --cuda-graph-max-bs 288 - --chunked-prefill-size 65536 + --chunked-prefill-size 16384 --tokenizer-worker-num 4 --enable-prefill-delayer --enable-deepseek-v4-fp4-indexer - --enable-flashinfer-allreduce-fusion + --enable-mixed-chunk ) elif [ "$CONC" = "4096" ]; then @@ -128,12 +128,12 @@ elif [ "$CONC" = "4096" ]; then --enable-dp-attention --moe-a2a-backend megamoe --cuda-graph-max-bs 544 - --chunked-prefill-size 65536 + --chunked-prefill-size 16384 --tokenizer-worker-num 8 --enable-prefill-delayer --decode-log-interval 5 --enable-deepseek-v4-fp4-indexer - --enable-flashinfer-allreduce-fusion + --enable-mixed-chunk ) elif [ "$CONC" = "8192" ]; then @@ -147,12 +147,12 @@ elif [ "$CONC" = "8192" ]; then --enable-dp-attention --moe-a2a-backend megamoe --cuda-graph-max-bs 1088 - --chunked-prefill-size 65536 + --chunked-prefill-size 16384 --tokenizer-worker-num 16 --enable-prefill-delayer --stream-interval 30 --enable-deepseek-v4-fp4-indexer - --enable-flashinfer-allreduce-fusion + --enable-mixed-chunk ) else diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 4b3f347d8..4495de831 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3543,5 +3543,5 @@ - config-keys: - dsv4-fp4-b300-sglang description: - - "Add --enable-flashinfer-allreduce-fusion to all concurrency profiles" + - "Add --enable-mixed-chunk to all concurrency profiles, set chunked-prefill-size 16384 for high-conc megamoe profiles" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1693 From 1bdee896a0ff50e475344e7a9902330bcd0ebfdd Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Wed, 10 Jun 2026 12:42:59 +0800 Subject: [PATCH 21/21] Remove CONC=4096 profile from dsv4-fp4-b300-sglang --- .github/configs/nvidia-master.yaml | 1 - .../fixed_seq_len/dsv4_fp4_b300_sglang.sh | 19 ------------------- 2 files changed, 20 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 56b11c889..a2c7f8631 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -2033,7 +2033,6 @@ dsv4-fp4-b300-sglang: - { tp: 4, ep: 1, conc-start: 32, conc-end: 32 } - { tp: 4, ep: 1, dp-attn: true, conc-start: 512, conc-end: 512 } - { tp: 8, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 4096, conc-end: 4096 } # DeepSeek-V4-Pro on B300 with EAGLE/MTP speculative decoding. Recipe is # selected inside benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh by diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh index 2efe2d2e0..a52f2d6f8 100755 --- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh +++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh @@ -117,25 +117,6 @@ elif [ "$CONC" = "2048" ]; then --enable-mixed-chunk ) -elif [ "$CONC" = "4096" ]; then - # DP attention, megamoe - export NVSHMEM_DISABLE_IB=1 - MEM_FRACTION_STATIC=0.835 - SWA_FULL_TOKENS_RATIO=0.075 - MAX_RUNNING_REQUESTS=4352 - PARALLEL_ARGS=( - --dp-size "$TP" - --enable-dp-attention - --moe-a2a-backend megamoe - --cuda-graph-max-bs 544 - --chunked-prefill-size 16384 - --tokenizer-worker-num 8 - --enable-prefill-delayer - --decode-log-interval 5 - --enable-deepseek-v4-fp4-indexer - --enable-mixed-chunk - ) - elif [ "$CONC" = "8192" ]; then # DP attention, megamoe export NVSHMEM_DISABLE_IB=1