From c444e42d40adb1599916211e95f88bf86664bdae Mon Sep 17 00:00:00 2001
From: yhyang201 <yhyang201@gmail.com>
Date: Mon, 8 Jun 2026 16:52:09 +0800
Subject: [PATCH 01/21] dsv4-fp4-b300-sglang: align env vars to GB300 and add
 fp4-indexer flag

---
 .../fixed_seq_len/dsv4_fp4_b300_sglang.sh     | 42 ++++++++++++-------
 1 file changed, 27 insertions(+), 15 deletions(-)

diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh
index b451dee0d..2079d4165 100755
--- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh
@@ -30,8 +30,11 @@ fi
 
 nvidia-smi
 
-# ─── Common env vars (all profiles) ───────────────────────────────────────────
-export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0
+# ─── Common env vars (all profiles, GB300-aligned) ──────────────────────────
+export SGLANG_JIT_DEEPGEMM_FAST_WARMUP=1
+export SGLANG_RADIX_FORCE_MISS=1
+export SGLANG_DEFAULT_THINKING=1
+export SGLANG_DSV4_REASONING_EFFORT=max
 export SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT=1
 
 SERVER_LOG="$PWD/server.log"
@@ -46,9 +49,25 @@ fi
 
 start_gpu_monitor --output "$PWD/gpu_metrics.csv"
 
+# ─── DP-attention env vars (GB300-aligned) ───────────────────────────────────
+# Shared across all DP-attention profiles (conc >= 512). Set before per-conc
+# tuning so individual blocks only carry NVSHMEM / batch-size overrides.
+if [ "$CONC" != "1" ] && [ "$CONC" != "32" ]; then
+    export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1
+    export SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW=1
+    export SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_FP4_ACTS=1
+    export SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_MXF4_KIND=1
+    export SGLANG_OPT_USE_ONLINE_COMPRESS=1
+    export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8192
+    export SGLANG_LOG_FORWARD_ITERS=1
+    export SGLANG_LOG_MS=1
+    export SGLANG_REQUEST_STATE_WAIT_TIMEOUT=60
+    export SGLANG_CLIP_MAX_NEW_TOKENS_ESTIMATION=8
+fi
+
 # ─── Per-concurrency launch profile ──────────────────────────────────────────
 # Each block sets: PARALLEL_ARGS, MEM_FRACTION_STATIC, SWA_FULL_TOKENS_RATIO,
-# and optionally MAX_RUNNING_REQUESTS plus profile-specific env vars.
+# and optionally MAX_RUNNING_REQUESTS.
 #
 # SWA ratio: 1k inputs need more SWA cache headroom than 8k inputs; 0.5 was
 # tuned empirically for the 1k1k recipe, while 0.1 is the cookbook default.
@@ -61,11 +80,11 @@ if [ "$CONC" = "1" ] || [ "$CONC" = "32" ]; then
         --moe-runner-backend flashinfer_mxfp4
         --chunked-prefill-size 8192
         --disable-flashinfer-autotune
+        --enable-deepseek-v4-fp4-indexer
     )
 
 elif [ "$CONC" = "512" ]; then
     # DP attention, flashinfer_mxfp4
-    export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1
     MEM_FRACTION_STATIC=0.94
     SWA_FULL_TOKENS_RATIO=$([[ "$ISL" == "1024" ]] && echo 0.5 || echo 0.1)
     PARALLEL_ARGS=(
@@ -75,15 +94,12 @@ elif [ "$CONC" = "512" ]; then
         --disable-flashinfer-autotune
         --chunked-prefill-size 16384
         --enable-prefill-delayer
+        --enable-deepseek-v4-fp4-indexer
     )
 
 elif [ "$CONC" = "2048" ]; then
     # DP attention, megamoe
-    export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1
     export NVSHMEM_DISABLE_IB=1
-    export SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW=1
-    export SGLANG_LOG_FORWARD_ITERS=1
-    export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8320
     MEM_FRACTION_STATIC=0.87
     SWA_FULL_TOKENS_RATIO=0.06
     MAX_RUNNING_REQUESTS=2560
@@ -95,14 +111,12 @@ elif [ "$CONC" = "2048" ]; then
         --chunked-prefill-size 65536
         --tokenizer-worker-num 4
         --enable-prefill-delayer
+        --enable-deepseek-v4-fp4-indexer
     )
 
 elif [ "$CONC" = "4096" ]; then
     # DP attention, megamoe
-    export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1
     export NVSHMEM_DISABLE_IB=1
-    export SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW=1
-    export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8320
     MEM_FRACTION_STATIC=0.835
     SWA_FULL_TOKENS_RATIO=0.075
     MAX_RUNNING_REQUESTS=4352
@@ -115,15 +129,12 @@ elif [ "$CONC" = "4096" ]; then
         --tokenizer-worker-num 8
         --enable-prefill-delayer
         --decode-log-interval 5
+        --enable-deepseek-v4-fp4-indexer
     )
 
 elif [ "$CONC" = "8192" ]; then
     # DP attention, megamoe
-    export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1
     export NVSHMEM_DISABLE_IB=1
-    export SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW=1
-    export SGLANG_OPT_USE_ONLINE_COMPRESS=1
-    export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8256
     MEM_FRACTION_STATIC=0.80
     SWA_FULL_TOKENS_RATIO=0.3
     MAX_RUNNING_REQUESTS=8192
@@ -136,6 +147,7 @@ elif [ "$CONC" = "8192" ]; then
         --tokenizer-worker-num 16
         --enable-prefill-delayer
         --stream-interval 30
+        --enable-deepseek-v4-fp4-indexer
     )
 
 else

From 3ab8dc9286c8f4eb224802cca8602a73f93eec9f Mon Sep 17 00:00:00 2001
From: yhyang201 <yhyang201@gmail.com>
Date: Mon, 8 Jun 2026 16:54:25 +0800
Subject: [PATCH 02/21] dsv4-fp4-b300-sglang: bump image to
 nightly-dev-cu13-20260608-303757cc

---
 .github/configs/nvidia-master.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index a02749d4d..f6374402c 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -2003,7 +2003,7 @@ dsr1-fp8-b300-sglang:
 # DeepSeek-V4-Pro on B300 with sglang (non-MTP).
 # Uses nightly image with megamoe backend for high-concurrency profiles.
 dsv4-fp4-b300-sglang:
-  image: lmsysorg/sglang:nightly-dev-cu13-20260529-a8cfae0b
+  image: lmsysorg/sglang:nightly-dev-cu13-20260608-303757cc
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: b300

From 250358c5189415d045e62aa3fc599d15493fd651 Mon Sep 17 00:00:00 2001
From: yhyang201 <yhyang201@gmail.com>
Date: Mon, 8 Jun 2026 17:25:10 +0800
Subject: [PATCH 03/21] Add perf-changelog entry for dsv4-fp4-b300-sglang env
 var alignment

---
 perf-changelog.yaml | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 5622173f1..a4eebe53d 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3531,3 +3531,11 @@
     - "The Rust frontend replaces only the Python serving/API layer (HTTP, tokenization, scheduling glue, detokenization) and spawns the same Python EngineCore, so GPU kernels/attention/MoE GEMM/KV cache are untouched"
     - "A/B sweep (28 single-node points, 1k1k + 8k1k, TP 1/2/4) vs the Python-frontend baseline (run 26696260751): throughput Pareto-neutral (peak tok/s/GPU within <1.5%, frontiers coincident) and TPOT flat (+-0.5%); TTFT improves ~8% at 1k1k and ~22% at 8k1k (every point), the expected signature of lower frontend CPU latency before first token, scaling with input length"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1634
+
+- config-keys:
+    - dsv4-fp4-b300-sglang
+  description:
+    - "Align env vars to GB300: replace PRECOMPILE=0 with FAST_WARMUP=1, add RADIX_FORCE_MISS, DEFAULT_THINKING, DSV4_REASONING_EFFORT=max"
+    - "Add shared DP-attention env vars: MEGA_MOE_USE_FP4_ACTS, USE_MXF4_KIND, USE_ONLINE_COMPRESS, unified NUM_MAX_TOKENS_PER_RANK=8192"
+    - "Bump image to nightly-dev-cu13-20260601-373cadc9"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1682

From 96b2462b3ed3996f5128b4e4c0e03936377718df Mon Sep 17 00:00:00 2001
From: yhyang201 <yhyang201@gmail.com>
Date: Tue, 9 Jun 2026 00:39:34 +0800
Subject: [PATCH 04/21] dsv4-fp4-b300-sglang: switch to
 nightly-dev-cu13-20260606-b3e4c204

---
 .github/configs/nvidia-master.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index f6374402c..51ea64ff4 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -2003,7 +2003,7 @@ dsr1-fp8-b300-sglang:
 # DeepSeek-V4-Pro on B300 with sglang (non-MTP).
 # Uses nightly image with megamoe backend for high-concurrency profiles.
 dsv4-fp4-b300-sglang:
-  image: lmsysorg/sglang:nightly-dev-cu13-20260608-303757cc
+  image: lmsysorg/sglang:nightly-dev-cu13-20260606-b3e4c204
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: b300

From 9d8e10dff591d9dfd3c1bcb17632563eedbb73a8 Mon Sep 17 00:00:00 2001
From: yhyang201 <yhyang201@gmail.com>
Date: Tue, 9 Jun 2026 01:19:51 +0800
Subject: [PATCH 05/21] dsv4-fp4-b300-sglang: switch to
 nightly-dev-cu13-20260604-14ed9b44

---
 .github/configs/nvidia-master.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 51ea64ff4..bc6b5effb 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -2003,7 +2003,7 @@ dsr1-fp8-b300-sglang:
 # DeepSeek-V4-Pro on B300 with sglang (non-MTP).
 # Uses nightly image with megamoe backend for high-concurrency profiles.
 dsv4-fp4-b300-sglang:
-  image: lmsysorg/sglang:nightly-dev-cu13-20260606-b3e4c204
+  image: lmsysorg/sglang:nightly-dev-cu13-20260604-14ed9b44
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: b300

From 50d1c91fdc6653a16a779dc8cdefe3a8fdc70c28 Mon Sep 17 00:00:00 2001
From: yhyang201 <yhyang201@gmail.com>
Date: Tue, 9 Jun 2026 02:00:01 +0800
Subject: [PATCH 06/21] dsv4-fp4-b300-sglang: switch to
 nightly-dev-cu13-20260601-373cadc9

---
 .github/configs/nvidia-master.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index bc6b5effb..d204aa7c1 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -2003,7 +2003,7 @@ dsr1-fp8-b300-sglang:
 # DeepSeek-V4-Pro on B300 with sglang (non-MTP).
 # Uses nightly image with megamoe backend for high-concurrency profiles.
 dsv4-fp4-b300-sglang:
-  image: lmsysorg/sglang:nightly-dev-cu13-20260604-14ed9b44
+  image: lmsysorg/sglang:nightly-dev-cu13-20260601-373cadc9
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: b300

From 1025c7fabbba0898a1445ed056ab7915cf4ed49d Mon Sep 17 00:00:00 2001
From: yhyang201 <yhyang201@gmail.com>
Date: Tue, 9 Jun 2026 07:03:48 +0800
Subject: [PATCH 07/21] dsv4-fp4-b300-sglang: remove
 --enable-deepseek-v4-fp4-indexer

---
 .../single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh  | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh
index 2079d4165..7295f353c 100755
--- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh
@@ -80,7 +80,7 @@ if [ "$CONC" = "1" ] || [ "$CONC" = "32" ]; then
         --moe-runner-backend flashinfer_mxfp4
         --chunked-prefill-size 8192
         --disable-flashinfer-autotune
-        --enable-deepseek-v4-fp4-indexer
+
     )
 
 elif [ "$CONC" = "512" ]; then
@@ -94,7 +94,7 @@ elif [ "$CONC" = "512" ]; then
         --disable-flashinfer-autotune
         --chunked-prefill-size 16384
         --enable-prefill-delayer
-        --enable-deepseek-v4-fp4-indexer
+
     )
 
 elif [ "$CONC" = "2048" ]; then
@@ -111,7 +111,7 @@ elif [ "$CONC" = "2048" ]; then
         --chunked-prefill-size 65536
         --tokenizer-worker-num 4
         --enable-prefill-delayer
-        --enable-deepseek-v4-fp4-indexer
+
     )
 
 elif [ "$CONC" = "4096" ]; then
@@ -129,7 +129,7 @@ elif [ "$CONC" = "4096" ]; then
         --tokenizer-worker-num 8
         --enable-prefill-delayer
         --decode-log-interval 5
-        --enable-deepseek-v4-fp4-indexer
+
     )
 
 elif [ "$CONC" = "8192" ]; then
@@ -147,7 +147,7 @@ elif [ "$CONC" = "8192" ]; then
         --tokenizer-worker-num 16
         --enable-prefill-delayer
         --stream-interval 30
-        --enable-deepseek-v4-fp4-indexer
+
     )
 
 else

From 0e02411b00e5a7307aeea69b09ba9bd9a7e1b286 Mon Sep 17 00:00:00 2001
From: yhyang201 <yhyang201@gmail.com>
Date: Tue, 9 Jun 2026 07:40:36 +0800
Subject: [PATCH 08/21] dsv4-fp4-b300-sglang: revert image to original
 nightly-dev-cu13-20260529-a8cfae0b

---
 .github/configs/nvidia-master.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index d204aa7c1..a02749d4d 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -2003,7 +2003,7 @@ dsr1-fp8-b300-sglang:
 # DeepSeek-V4-Pro on B300 with sglang (non-MTP).
 # Uses nightly image with megamoe backend for high-concurrency profiles.
 dsv4-fp4-b300-sglang:
-  image: lmsysorg/sglang:nightly-dev-cu13-20260601-373cadc9
+  image: lmsysorg/sglang:nightly-dev-cu13-20260529-a8cfae0b
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: b300

From c65520a787a92d00a7e068e40d8fd78302103f8f Mon Sep 17 00:00:00 2001
From: yhyang201 <yhyang201@gmail.com>
Date: Tue, 9 Jun 2026 11:13:26 +0800
Subject: [PATCH 09/21] dsv4-fp4-b300-sglang: switch to
 nightly-dev-cu13-20260609-317fc6a9

---
 .github/configs/nvidia-master.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index a02749d4d..56b11c889 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -2003,7 +2003,7 @@ dsr1-fp8-b300-sglang:
 # DeepSeek-V4-Pro on B300 with sglang (non-MTP).
 # Uses nightly image with megamoe backend for high-concurrency profiles.
 dsv4-fp4-b300-sglang:
-  image: lmsysorg/sglang:nightly-dev-cu13-20260529-a8cfae0b
+  image: lmsysorg/sglang:nightly-dev-cu13-20260609-317fc6a9
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: b300

From 6beee8c23f232ef6a6a2ef2fe719ef081afd9773 Mon Sep 17 00:00:00 2001
From: yhyang201 <yhyang201@gmail.com>
Date: Tue, 9 Jun 2026 14:06:03 +0800
Subject: [PATCH 10/21] dsv4-fp4-b300-sglang: kill stale processes on server
 ports before launch

---
 benchmarks/benchmark_lib.sh                   | 54 +++++++++++++++++++
 .../fixed_seq_len/dsv4_fp4_b300_sglang.sh     |  2 +
 2 files changed, 56 insertions(+)

diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
index e3080b4bf..95c8d7217 100644
--- a/benchmarks/benchmark_lib.sh
+++ b/benchmarks/benchmark_lib.sh
@@ -78,6 +78,60 @@ stop_gpu_monitor() {
 # Check if required environment variables are set
 # Usage: check_env_vars VAR1 VAR2 VAR3 ...
 # Exits with code 1 if any variable is not set
+# Kill any process listening on the given TCP port.
+# Usage: kill_port_users PORT [PORT ...]
+# Silently succeeds if the port is already free.
+kill_port_users() {
+    for port in "$@"; do
+        local pids
+        # lsof is the most portable way; fall back to fuser / ss+awk.
+        if command -v lsof >/dev/null 2>&1; then
+            pids=$(lsof -ti "tcp:$port" 2>/dev/null || true)
+        elif command -v fuser >/dev/null 2>&1; then
+            pids=$(fuser "$port/tcp" 2>/dev/null | tr -s ' ' '\n' || true)
+        elif command -v ss >/dev/null 2>&1; then
+            pids=$(ss -tlnp "sport = :$port" 2>/dev/null \
+                   | awk -F'pid=' 'NR>1 && $2{split($2,a,","); print a[1]}' || true)
+        fi
+
+        pids=$(echo "$pids" | xargs)  # trim whitespace
+        if [[ -z "$pids" ]]; then
+            continue
+        fi
+
+        echo "[cleanup] Killing process(es) on port $port: $pids"
+        # SIGTERM first, give 3s, then SIGKILL
+        kill $pids 2>/dev/null || true
+        for _i in $(seq 1 6); do
+            if ! lsof -ti "tcp:$port" >/dev/null 2>&1 && \
+               ! fuser "$port/tcp" >/dev/null 2>&1; then
+                break
+            fi
+            sleep 0.5
+        done
+        kill -9 $pids 2>/dev/null || true
+    done
+}
+
+# Kill stale processes on the server port and the range of ZMQ/NCCL ports
+# that sglang derives from it (port+234 … port+239). Call before launching
+# the inference server to avoid "port not available" errors from leftover
+# processes of a prior run.
+# Usage: cleanup_server_ports [PORT]   (defaults to $PORT / 8888)
+cleanup_server_ports() {
+    local base="${1:-${PORT:-8888}}"
+    local zmq_base=$((base + 234))  # port + ZMQ_TCP_PORT_DELTA(233) + 1
+    local ports=("$base")
+    for offset in $(seq 0 5); do
+        ports+=($((zmq_base + offset)))
+    done
+    # Also kill the NCCL port range (base + 234 + 100 … +108 for tp=8)
+    for offset in $(seq 100 108); do
+        ports+=($((zmq_base + offset)))
+    done
+    kill_port_users "${ports[@]}"
+}
+
 check_env_vars() {
     local missing_vars=()
 
diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh
index 7295f353c..447cef0c6 100755
--- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh
@@ -163,6 +163,8 @@ fi
     echo "==================================="
 } | tee "$SERVER_LOG"
 
+cleanup_server_ports
+
 set -x
 PYTHONNOUSERSITE=1 sglang serve \
     --model-path $MODEL_PATH --served-model-name $MODEL \

From 5bd18adc9f982b19b2b985a5b4b6d2e0e3d96c16 Mon Sep 17 00:00:00 2001
From: yhyang201 <yhyang201@gmail.com>
Date: Tue, 9 Jun 2026 14:30:32 +0800
Subject: [PATCH 11/21] benchmark: kill stale server processes before launch

DP-attention mode uses deterministic TCP ports derived from $PORT.
If a previous sglang process didn't exit cleanly (e.g. Slurm scancel
didn't fully propagate SIGTERM to all subprocesses), the next launch
fails with "metrics_port at 9125 is not available". Non-DP-attention
mode is unaffected because it uses ipc:// with random temp paths.

Add kill_stale_servers() to benchmark_lib.sh and call it from all
sglang benchmark scripts that use DP-attention (B300 and B200).
---
 benchmarks/benchmark_lib.sh                   | 22 +++++++++++++++++++
 .../fixed_seq_len/dsv4_fp4_b200.sh            |  2 ++
 .../fixed_seq_len/dsv4_fp4_b300_sglang.sh     |  2 +-
 .../fixed_seq_len/dsv4_fp4_b300_sglang_mtp.sh |  2 ++
 4 files changed, 27 insertions(+), 1 deletion(-)

diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
index 95c8d7217..f6509c865 100644
--- a/benchmarks/benchmark_lib.sh
+++ b/benchmarks/benchmark_lib.sh
@@ -16,6 +16,28 @@ mkdir -p "$PYTHONPYCACHEPREFIX" 2>/dev/null || true
 # nothing upstream set it.
 export PORT="${PORT:-8888}"
 
+# --------------------------------
+# Stale server cleanup
+# --------------------------------
+# Kill leftover inference-server processes from a prior run on this node.
+# DP-attention mode derives deterministic TCP ports from $PORT, so a stale
+# sglang process that didn't die cleanly will block the next launch.
+# This runs on the compute node (inside srun), where the processes live.
+kill_stale_servers() {
+    echo "[Cleanup] Killing stale inference-server processes ..."
+    # Kill by port: main server port and DP-attention ZMQ ports (port+234..port+238)
+    local _port
+    for _port in "$PORT" $(seq $((PORT + 234)) $((PORT + 238))); do
+        fuser -k "$_port/tcp" 2>/dev/null || true
+    done
+    # Belt-and-suspenders: kill any remaining sglang/vllm serve processes
+    pkill -9 -f "sglang serve" 2>/dev/null || true
+    pkill -9 -f "sglang.srt" 2>/dev/null || true
+    pkill -9 -f "vllm serve" 2>/dev/null || true
+    sleep 2
+    echo "[Cleanup] Done."
+}
+
 # --------------------------------
 # GPU monitoring helpers
 # --------------------------------
diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200.sh
index e1d031854..6c0fee403 100755
--- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200.sh
@@ -82,6 +82,8 @@ fi
     echo "==================================="
 } | tee "$SERVER_LOG"
 
+kill_stale_servers
+
 set -x
 PYTHONNOUSERSITE=1 sglang serve \
     --model-path $MODEL \
diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh
index 447cef0c6..042602074 100755
--- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh
@@ -163,7 +163,7 @@ fi
     echo "==================================="
 } | tee "$SERVER_LOG"
 
-cleanup_server_ports
+kill_stale_servers
 
 set -x
 PYTHONNOUSERSITE=1 sglang serve \
diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang_mtp.sh
index 672d48f4b..f2c25148b 100755
--- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang_mtp.sh
@@ -124,6 +124,8 @@ fi
     echo "==================================="
 } | tee "$SERVER_LOG"
 
+kill_stale_servers
+
 set -x
 PYTHONNOUSERSITE=1 sglang serve \
     --model-path $MODEL_PATH --served-model-name $MODEL \

From c27f53faefe81ea1db4cc7dafd25d1f8b37c23ff Mon Sep 17 00:00:00 2001
From: yhyang201 <yhyang201@gmail.com>
Date: Tue, 9 Jun 2026 16:28:40 +0800
Subject: [PATCH 12/21] benchmark: use python3+psutil to kill stale server by
 port

fuser/pkill may not be available in all containers. Use python3+psutil
(always present in sglang/vllm images) to precisely kill processes on
$PORT and $PORT+237 (the DP-attention metrics_port = PORT + ZMQ_TCP_PORT_DELTA + 4).
---
 benchmarks/benchmark_lib.sh | 31 +++++++++++++++++++------------
 1 file changed, 19 insertions(+), 12 deletions(-)

diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
index f6509c865..e44c19f9f 100644
--- a/benchmarks/benchmark_lib.sh
+++ b/benchmarks/benchmark_lib.sh
@@ -20,20 +20,27 @@ export PORT="${PORT:-8888}"
 # Stale server cleanup
 # --------------------------------
 # Kill leftover inference-server processes from a prior run on this node.
-# DP-attention mode derives deterministic TCP ports from $PORT, so a stale
-# sglang process that didn't die cleanly will block the next launch.
-# This runs on the compute node (inside srun), where the processes live.
+# DP-attention mode derives deterministic TCP ports from $PORT:
+#   metrics_port = PORT + 233 (ZMQ_TCP_PORT_DELTA) + 1 + 3 = PORT + 237
+# A stale sglang process holding that port blocks the next launch.
 kill_stale_servers() {
     echo "[Cleanup] Killing stale inference-server processes ..."
-    # Kill by port: main server port and DP-attention ZMQ ports (port+234..port+238)
-    local _port
-    for _port in "$PORT" $(seq $((PORT + 234)) $((PORT + 238))); do
-        fuser -k "$_port/tcp" 2>/dev/null || true
-    done
-    # Belt-and-suspenders: kill any remaining sglang/vllm serve processes
-    pkill -9 -f "sglang serve" 2>/dev/null || true
-    pkill -9 -f "sglang.srt" 2>/dev/null || true
-    pkill -9 -f "vllm serve" 2>/dev/null || true
+    # Use python3+psutil (always present in sglang/vllm images) to find and
+    # kill any process listening on our server port or the derived metrics port.
+    python3 -c "
+import os, signal, psutil
+targets = {int(os.environ.get('PORT', 8888))}
+targets.add(min(targets) + 237)          # metrics_port
+killed = set()
+for c in psutil.net_connections('inet'):
+    if c.laddr.port in targets and c.pid and c.pid not in killed:
+        try:
+            os.kill(c.pid, signal.SIGKILL)
+            killed.add(c.pid)
+            print(f'  killed pid {c.pid} (port {c.laddr.port})')
+        except OSError:
+            pass
+" 2>/dev/null || true
     sleep 2
     echo "[Cleanup] Done."
 }

From cd2f55f712585015822903f9fe7b11fbe79f6fe2 Mon Sep 17 00:00:00 2001
From: yhyang201 <yhyang201@gmail.com>
Date: Tue, 9 Jun 2026 16:56:42 +0800
Subject: [PATCH 13/21] fix stale process cleanup: run pkill on host, not
 inside container

The previous kill_stale_servers ran inside the Pyxis container, which
cannot see or kill processes in the host PID namespace. Move the
cleanup to launch_b300-nv.sh as a bare srun (no --container-image)
so pkill runs directly on the compute node and can kill leftover
sglang/vllm processes from prior container runs.
---
 benchmarks/benchmark_lib.sh                   | 29 -------------------
 .../fixed_seq_len/dsv4_fp4_b200.sh            |  2 --
 .../fixed_seq_len/dsv4_fp4_b300_sglang.sh     |  2 --
 .../fixed_seq_len/dsv4_fp4_b300_sglang_mtp.sh |  2 --
 runners/launch_b300-nv.sh                     |  7 +++++
 5 files changed, 7 insertions(+), 35 deletions(-)

diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
index e44c19f9f..95c8d7217 100644
--- a/benchmarks/benchmark_lib.sh
+++ b/benchmarks/benchmark_lib.sh
@@ -16,35 +16,6 @@ mkdir -p "$PYTHONPYCACHEPREFIX" 2>/dev/null || true
 # nothing upstream set it.
 export PORT="${PORT:-8888}"
 
-# --------------------------------
-# Stale server cleanup
-# --------------------------------
-# Kill leftover inference-server processes from a prior run on this node.
-# DP-attention mode derives deterministic TCP ports from $PORT:
-#   metrics_port = PORT + 233 (ZMQ_TCP_PORT_DELTA) + 1 + 3 = PORT + 237
-# A stale sglang process holding that port blocks the next launch.
-kill_stale_servers() {
-    echo "[Cleanup] Killing stale inference-server processes ..."
-    # Use python3+psutil (always present in sglang/vllm images) to find and
-    # kill any process listening on our server port or the derived metrics port.
-    python3 -c "
-import os, signal, psutil
-targets = {int(os.environ.get('PORT', 8888))}
-targets.add(min(targets) + 237)          # metrics_port
-killed = set()
-for c in psutil.net_connections('inet'):
-    if c.laddr.port in targets and c.pid and c.pid not in killed:
-        try:
-            os.kill(c.pid, signal.SIGKILL)
-            killed.add(c.pid)
-            print(f'  killed pid {c.pid} (port {c.laddr.port})')
-        except OSError:
-            pass
-" 2>/dev/null || true
-    sleep 2
-    echo "[Cleanup] Done."
-}
-
 # --------------------------------
 # GPU monitoring helpers
 # --------------------------------
diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200.sh
index 6c0fee403..e1d031854 100755
--- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200.sh
@@ -82,8 +82,6 @@ fi
     echo "==================================="
 } | tee "$SERVER_LOG"
 
-kill_stale_servers
-
 set -x
 PYTHONNOUSERSITE=1 sglang serve \
     --model-path $MODEL \
diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh
index 042602074..7295f353c 100755
--- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh
@@ -163,8 +163,6 @@ fi
     echo "==================================="
 } | tee "$SERVER_LOG"
 
-kill_stale_servers
-
 set -x
 PYTHONNOUSERSITE=1 sglang serve \
     --model-path $MODEL_PATH --served-model-name $MODEL \
diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang_mtp.sh
index f2c25148b..672d48f4b 100755
--- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang_mtp.sh
@@ -124,8 +124,6 @@ fi
     echo "==================================="
 } | tee "$SERVER_LOG"
 
-kill_stale_servers
-
 set -x
 PYTHONNOUSERSITE=1 sglang serve \
     --model-path $MODEL_PATH --served-model-name $MODEL \
diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh
index fc0ac297f..f60dae907 100644
--- a/runners/launch_b300-nv.sh
+++ b/runners/launch_b300-nv.sh
@@ -408,6 +408,13 @@ else
     salloc --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT -N 1 --gres=gpu:$TP --exclusive --time="${SALLOC_TIME_LIMIT:-180}" --no-shell --job-name="$RUNNER_NAME"
     JOB_ID=$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1)
 
+    # Kill stale sglang/vllm processes on the compute node (outside container).
+    # DP-attention uses deterministic TCP ports; a leftover process blocks the
+    # next launch with "metrics_port not available".
+    srun --jobid=$JOB_ID --mpi=none \
+        bash -c 'echo "[Host cleanup] killing stale inference processes ..."; pkill -9 -f "sglang" 2>/dev/null; pkill -9 -f "vllm" 2>/dev/null; sleep 2; echo "[Host cleanup] done."' \
+        || true
+
     srun --jobid=$JOB_ID \
         --mpi=none \
         --container-image=$SQUASH_FILE \

From 276647bd63c1a775d52757c131701b2e3065005d Mon Sep 17 00:00:00 2001
From: yhyang201 <yhyang201@gmail.com>
Date: Tue, 9 Jun 2026 17:10:28 +0800
Subject: [PATCH 14/21] b300: fix pkill self-kill and change PORT to 30000

Two fixes:
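- pkill -f "sglang" matched the wrapping bash -c shell, whose command
  string contains the literal pattern (pkill never selects itself), so
  the cleanup shell was killed before it could finish. Use the [s]glang
  regex trick so the pattern no longer matches the wrapper's own argv.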
- Change PORT from 8888 to 30000 to avoid conflict with stale
  processes still bound to 8888-derived ports (metrics_port 9125).
---
 runners/launch_b300-nv.sh | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh
index f60dae907..e1088d8be 100644
--- a/runners/launch_b300-nv.sh
+++ b/runners/launch_b300-nv.sh
@@ -411,8 +411,10 @@ else
     # Kill stale sglang/vllm processes on the compute node (outside container).
     # DP-attention uses deterministic TCP ports; a leftover process blocks the
     # next launch with "metrics_port not available".
+    # [s]glang trick: the regex [s]glang matches "sglang" but not the literal
+    # "[s]glang" in pkill's own argv, preventing pkill from killing itself.
     srun --jobid=$JOB_ID --mpi=none \
-        bash -c 'echo "[Host cleanup] killing stale inference processes ..."; pkill -9 -f "sglang" 2>/dev/null; pkill -9 -f "vllm" 2>/dev/null; sleep 2; echo "[Host cleanup] done."' \
+        bash -c 'echo "[Host cleanup] killing stale inference processes ..."; pkill -9 -f "[s]glang" 2>/dev/null; pkill -9 -f "[v]llm" 2>/dev/null; sleep 2; echo "[Host cleanup] done."' \
         || true
 
     srun --jobid=$JOB_ID \
@@ -421,7 +423,7 @@ else
         --container-mounts=$GITHUB_WORKSPACE:$CONTAINER_MOUNT_DIR,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE_MOUNT,$WRITABLE_MODELS_DIR:$WRITABLE_MODELS_DIR \
         --no-container-mount-home \
         --container-workdir=$CONTAINER_MOUNT_DIR \
-        --no-container-entrypoint --export=ALL,PORT=8888 \
+        --no-container-entrypoint --export=ALL,PORT=30000 \
         bash "$BENCH_SCRIPT"
 
 fi

From 27d5fe9ebfb886121d0bff8491cbe005f08bf902 Mon Sep 17 00:00:00 2001
From: yhyang201 <yhyang201@gmail.com>
Date: Tue, 9 Jun 2026 19:14:37 +0800
Subject: [PATCH 15/21] dsv4-fp4-b300-sglang: remove kill logic, add
 --enable-deepseek-v4-fp4-indexer

---
 .../single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh  | 10 +++++-----
 runners/launch_b300-nv.sh                              |  9 ---------
 2 files changed, 5 insertions(+), 14 deletions(-)

diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh
index 7295f353c..2079d4165 100755
--- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh
@@ -80,7 +80,7 @@ if [ "$CONC" = "1" ] || [ "$CONC" = "32" ]; then
         --moe-runner-backend flashinfer_mxfp4
         --chunked-prefill-size 8192
         --disable-flashinfer-autotune
-
+        --enable-deepseek-v4-fp4-indexer
     )
 
 elif [ "$CONC" = "512" ]; then
@@ -94,7 +94,7 @@ elif [ "$CONC" = "512" ]; then
         --disable-flashinfer-autotune
         --chunked-prefill-size 16384
         --enable-prefill-delayer
-
+        --enable-deepseek-v4-fp4-indexer
     )
 
 elif [ "$CONC" = "2048" ]; then
@@ -111,7 +111,7 @@ elif [ "$CONC" = "2048" ]; then
         --chunked-prefill-size 65536
         --tokenizer-worker-num 4
         --enable-prefill-delayer
-
+        --enable-deepseek-v4-fp4-indexer
     )
 
 elif [ "$CONC" = "4096" ]; then
@@ -129,7 +129,7 @@ elif [ "$CONC" = "4096" ]; then
         --tokenizer-worker-num 8
         --enable-prefill-delayer
         --decode-log-interval 5
-
+        --enable-deepseek-v4-fp4-indexer
     )
 
 elif [ "$CONC" = "8192" ]; then
@@ -147,7 +147,7 @@ elif [ "$CONC" = "8192" ]; then
         --tokenizer-worker-num 16
         --enable-prefill-delayer
         --stream-interval 30
-
+        --enable-deepseek-v4-fp4-indexer
     )
 
 else
diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh
index e1088d8be..665160fd3 100644
--- a/runners/launch_b300-nv.sh
+++ b/runners/launch_b300-nv.sh
@@ -408,15 +408,6 @@ else
     salloc --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT -N 1 --gres=gpu:$TP --exclusive --time="${SALLOC_TIME_LIMIT:-180}" --no-shell --job-name="$RUNNER_NAME"
     JOB_ID=$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1)
 
-    # Kill stale sglang/vllm processes on the compute node (outside container).
-    # DP-attention uses deterministic TCP ports; a leftover process blocks the
-    # next launch with "metrics_port not available".
-    # [s]glang trick: the regex [s]glang matches "sglang" but not the literal
-    # "[s]glang" in pkill's own argv, preventing pkill from killing itself.
-    srun --jobid=$JOB_ID --mpi=none \
-        bash -c 'echo "[Host cleanup] killing stale inference processes ..."; pkill -9 -f "[s]glang" 2>/dev/null; pkill -9 -f "[v]llm" 2>/dev/null; sleep 2; echo "[Host cleanup] done."' \
-        || true
-
     srun --jobid=$JOB_ID \
         --mpi=none \
         --container-image=$SQUASH_FILE \

From 30211e40d6f1cf1303eb0ad339ce052627fbd2cb Mon Sep 17 00:00:00 2001
From: yhyang201 <yhyang201@gmail.com>
Date: Tue, 9 Jun 2026 23:42:46 +0800
Subject: [PATCH 16/21] dsv4-fp4-b300-sglang: add
 --enforce-piecewise-cuda-graph --enable-mixed-chunk

---
 .../single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh  | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh
index 2079d4165..6a56d9fc2 100755
--- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh
@@ -81,6 +81,8 @@ if [ "$CONC" = "1" ] || [ "$CONC" = "32" ]; then
         --chunked-prefill-size 8192
         --disable-flashinfer-autotune
         --enable-deepseek-v4-fp4-indexer
+        --enforce-piecewise-cuda-graph
+        --enable-mixed-chunk
     )
 
 elif [ "$CONC" = "512" ]; then
@@ -95,6 +97,8 @@ elif [ "$CONC" = "512" ]; then
         --chunked-prefill-size 16384
         --enable-prefill-delayer
         --enable-deepseek-v4-fp4-indexer
+        --enforce-piecewise-cuda-graph
+        --enable-mixed-chunk
     )
 
 elif [ "$CONC" = "2048" ]; then
@@ -112,6 +116,8 @@ elif [ "$CONC" = "2048" ]; then
         --tokenizer-worker-num 4
         --enable-prefill-delayer
         --enable-deepseek-v4-fp4-indexer
+        --enforce-piecewise-cuda-graph
+        --enable-mixed-chunk
     )
 
 elif [ "$CONC" = "4096" ]; then
@@ -130,6 +136,8 @@ elif [ "$CONC" = "4096" ]; then
         --enable-prefill-delayer
         --decode-log-interval 5
         --enable-deepseek-v4-fp4-indexer
+        --enforce-piecewise-cuda-graph
+        --enable-mixed-chunk
     )
 
 elif [ "$CONC" = "8192" ]; then
@@ -148,6 +156,8 @@ elif [ "$CONC" = "8192" ]; then
         --enable-prefill-delayer
         --stream-interval 30
         --enable-deepseek-v4-fp4-indexer
+        --enforce-piecewise-cuda-graph
+        --enable-mixed-chunk
     )
 
 else

From 48af571260914325a6058cbb7f5f0ada3b6bbb9a Mon Sep 17 00:00:00 2001
From: yhyang201 <yhyang201@gmail.com>
Date: Tue, 9 Jun 2026 23:46:52 +0800
Subject: [PATCH 17/21] perf-changelog: add entry for piecewise cuda graph PR
 #1693

---
 perf-changelog.yaml | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index a4eebe53d..920069783 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3539,3 +3539,9 @@
     - "Add shared DP-attention env vars: MEGA_MOE_USE_FP4_ACTS, USE_MXF4_KIND, USE_ONLINE_COMPRESS, unified NUM_MAX_TOKENS_PER_RANK=8192"
     - "Bump image to nightly-dev-cu13-20260601-373cadc9"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1682
+
+- config-keys:
+    - dsv4-fp4-b300-sglang
+  description:
+    - "Add --enforce-piecewise-cuda-graph and --enable-mixed-chunk to all concurrency profiles"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1693

From 38653b9be2abe00bf8a2e27dfe571ac59cfea0b2 Mon Sep 17 00:00:00 2001
From: yhyang201 <yhyang201@gmail.com>
Date: Tue, 9 Jun 2026 23:49:15 +0800
Subject: [PATCH 18/21] benchmark_lib: remove stale
 kill_port_users/cleanup_server_ports

---
 benchmarks/benchmark_lib.sh | 54 -------------------------------------
 1 file changed, 54 deletions(-)

diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
index 95c8d7217..e3080b4bf 100644
--- a/benchmarks/benchmark_lib.sh
+++ b/benchmarks/benchmark_lib.sh
@@ -78,60 +78,6 @@ stop_gpu_monitor() {
 # Check if required environment variables are set
 # Usage: check_env_vars VAR1 VAR2 VAR3 ...
 # Exits with code 1 if any variable is not set
-# Kill any process listening on the given TCP port.
-# Usage: kill_port_users PORT [PORT ...]
-# Silently succeeds if the port is already free.
-kill_port_users() {
-    for port in "$@"; do
-        local pids
-        # lsof is the most portable way; fall back to fuser / ss+awk.
-        if command -v lsof >/dev/null 2>&1; then
-            pids=$(lsof -ti "tcp:$port" 2>/dev/null || true)
-        elif command -v fuser >/dev/null 2>&1; then
-            pids=$(fuser "$port/tcp" 2>/dev/null | tr -s ' ' '\n' || true)
-        elif command -v ss >/dev/null 2>&1; then
-            pids=$(ss -tlnp "sport = :$port" 2>/dev/null \
-                   | awk -F'pid=' 'NR>1 && $2{split($2,a,","); print a[1]}' || true)
-        fi
-
-        pids=$(echo "$pids" | xargs)  # trim whitespace
-        if [[ -z "$pids" ]]; then
-            continue
-        fi
-
-        echo "[cleanup] Killing process(es) on port $port: $pids"
-        # SIGTERM first, give 3s, then SIGKILL
-        kill $pids 2>/dev/null || true
-        for _i in $(seq 1 6); do
-            if ! lsof -ti "tcp:$port" >/dev/null 2>&1 && \
-               ! fuser "$port/tcp" >/dev/null 2>&1; then
-                break
-            fi
-            sleep 0.5
-        done
-        kill -9 $pids 2>/dev/null || true
-    done
-}
-
-# Kill stale processes on the server port and the range of ZMQ/NCCL ports
-# that sglang derives from it (port+233 … port+239). Call before launching
-# the inference server to avoid "port not available" errors from leftover
-# processes of a prior run.
-# Usage: cleanup_server_ports [PORT]   (defaults to $PORT / 8888)
-cleanup_server_ports() {
-    local base="${1:-${PORT:-8888}}"
-    local zmq_base=$((base + 234))  # port + ZMQ_TCP_PORT_DELTA(233) + 1
-    local ports=("$base")
-    for offset in $(seq 0 5); do
-        ports+=($((zmq_base + offset)))
-    done
-    # Also kill the NCCL port range (base + 234 + 100 … +108 for tp=8)
-    for offset in $(seq 100 108); do
-        ports+=($((zmq_base + offset)))
-    done
-    kill_port_users "${ports[@]}"
-}
-
 check_env_vars() {
     local missing_vars=()
 

From abf43499033701ca3c9bcb74a878b66ab14f6c29 Mon Sep 17 00:00:00 2001
From: yhyang201 <yhyang201@gmail.com>
Date: Wed, 10 Jun 2026 00:01:23 +0800
Subject: [PATCH 19/21] dsv4-fp4-b300-sglang: replace
 piecewise-cuda-graph/mixed-chunk with flashinfer-allreduce-fusion

---
 .../fixed_seq_len/dsv4_fp4_b300_sglang.sh         | 15 +++++----------
 perf-changelog.yaml                               |  2 +-
 2 files changed, 6 insertions(+), 11 deletions(-)

diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh
index 6a56d9fc2..fc55899df 100755
--- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh
@@ -81,8 +81,7 @@ if [ "$CONC" = "1" ] || [ "$CONC" = "32" ]; then
         --chunked-prefill-size 8192
         --disable-flashinfer-autotune
         --enable-deepseek-v4-fp4-indexer
-        --enforce-piecewise-cuda-graph
-        --enable-mixed-chunk
+        --enable-flashinfer-allreduce-fusion
     )
 
 elif [ "$CONC" = "512" ]; then
@@ -97,8 +96,7 @@ elif [ "$CONC" = "512" ]; then
         --chunked-prefill-size 16384
         --enable-prefill-delayer
         --enable-deepseek-v4-fp4-indexer
-        --enforce-piecewise-cuda-graph
-        --enable-mixed-chunk
+        --enable-flashinfer-allreduce-fusion
     )
 
 elif [ "$CONC" = "2048" ]; then
@@ -116,8 +114,7 @@ elif [ "$CONC" = "2048" ]; then
         --tokenizer-worker-num 4
         --enable-prefill-delayer
         --enable-deepseek-v4-fp4-indexer
-        --enforce-piecewise-cuda-graph
-        --enable-mixed-chunk
+        --enable-flashinfer-allreduce-fusion
     )
 
 elif [ "$CONC" = "4096" ]; then
@@ -136,8 +133,7 @@ elif [ "$CONC" = "4096" ]; then
         --enable-prefill-delayer
         --decode-log-interval 5
         --enable-deepseek-v4-fp4-indexer
-        --enforce-piecewise-cuda-graph
-        --enable-mixed-chunk
+        --enable-flashinfer-allreduce-fusion
     )
 
 elif [ "$CONC" = "8192" ]; then
@@ -156,8 +152,7 @@ elif [ "$CONC" = "8192" ]; then
         --enable-prefill-delayer
         --stream-interval 30
         --enable-deepseek-v4-fp4-indexer
-        --enforce-piecewise-cuda-graph
-        --enable-mixed-chunk
+        --enable-flashinfer-allreduce-fusion
     )
 
 else
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 920069783..4b3f347d8 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3543,5 +3543,5 @@
 - config-keys:
     - dsv4-fp4-b300-sglang
   description:
-    - "Add --enforce-piecewise-cuda-graph and --enable-mixed-chunk to all concurrency profiles"
+    - "Add --enable-flashinfer-allreduce-fusion to all concurrency profiles"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1693

From f2724b9ad60e597318ae1a7640ef3906c3d2e11f Mon Sep 17 00:00:00 2001
From: yhyang201 <yhyang201@gmail.com>
Date: Wed, 10 Jun 2026 09:14:02 +0800
Subject: [PATCH 20/21] dsv4-fp4-b300-sglang: replace
 --enable-flashinfer-allreduce-fusion with --enable-mixed-chunk, set
 chunked-prefill-size 16384 for high-conc megamoe profiles

---
 .../fixed_seq_len/dsv4_fp4_b300_sglang.sh        | 16 ++++++++--------
 perf-changelog.yaml                              |  2 +-
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh
index fc55899df..2efe2d2e0 100755
--- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh
@@ -81,7 +81,7 @@ if [ "$CONC" = "1" ] || [ "$CONC" = "32" ]; then
         --chunked-prefill-size 8192
         --disable-flashinfer-autotune
         --enable-deepseek-v4-fp4-indexer
-        --enable-flashinfer-allreduce-fusion
+        --enable-mixed-chunk
     )
 
 elif [ "$CONC" = "512" ]; then
@@ -96,7 +96,7 @@ elif [ "$CONC" = "512" ]; then
         --chunked-prefill-size 16384
         --enable-prefill-delayer
         --enable-deepseek-v4-fp4-indexer
-        --enable-flashinfer-allreduce-fusion
+        --enable-mixed-chunk
     )
 
 elif [ "$CONC" = "2048" ]; then
@@ -110,11 +110,11 @@ elif [ "$CONC" = "2048" ]; then
         --enable-dp-attention
         --moe-a2a-backend megamoe
         --cuda-graph-max-bs 288
-        --chunked-prefill-size 65536
+        --chunked-prefill-size 16384
         --tokenizer-worker-num 4
         --enable-prefill-delayer
         --enable-deepseek-v4-fp4-indexer
-        --enable-flashinfer-allreduce-fusion
+        --enable-mixed-chunk
     )
 
 elif [ "$CONC" = "4096" ]; then
@@ -128,12 +128,12 @@ elif [ "$CONC" = "4096" ]; then
         --enable-dp-attention
         --moe-a2a-backend megamoe
         --cuda-graph-max-bs 544
-        --chunked-prefill-size 65536
+        --chunked-prefill-size 16384
         --tokenizer-worker-num 8
         --enable-prefill-delayer
         --decode-log-interval 5
         --enable-deepseek-v4-fp4-indexer
-        --enable-flashinfer-allreduce-fusion
+        --enable-mixed-chunk
     )
 
 elif [ "$CONC" = "8192" ]; then
@@ -147,12 +147,12 @@ elif [ "$CONC" = "8192" ]; then
         --enable-dp-attention
         --moe-a2a-backend megamoe
         --cuda-graph-max-bs 1088
-        --chunked-prefill-size 65536
+        --chunked-prefill-size 16384
         --tokenizer-worker-num 16
         --enable-prefill-delayer
         --stream-interval 30
         --enable-deepseek-v4-fp4-indexer
-        --enable-flashinfer-allreduce-fusion
+        --enable-mixed-chunk
     )
 
 else
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 4b3f347d8..4495de831 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3543,5 +3543,5 @@
 - config-keys:
     - dsv4-fp4-b300-sglang
   description:
-    - "Add --enable-flashinfer-allreduce-fusion to all concurrency profiles"
+    - "Add --enable-mixed-chunk to all concurrency profiles, set chunked-prefill-size 16384 for high-conc megamoe profiles"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1693

From 1bdee896a0ff50e475344e7a9902330bcd0ebfdd Mon Sep 17 00:00:00 2001
From: yhyang201 <yhyang201@gmail.com>
Date: Wed, 10 Jun 2026 12:42:59 +0800
Subject: [PATCH 21/21] dsv4-fp4-b300-sglang: remove CONC=4096 profile

---
 .github/configs/nvidia-master.yaml            |  1 -
 .../fixed_seq_len/dsv4_fp4_b300_sglang.sh     | 19 -------------------
 2 files changed, 20 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 56b11c889..a2c7f8631 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -2033,7 +2033,6 @@ dsv4-fp4-b300-sglang:
       - { tp: 4, ep: 1, conc-start: 32, conc-end: 32 }
       - { tp: 4, ep: 1, dp-attn: true, conc-start: 512, conc-end: 512 }
       - { tp: 8, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 }
-      - { tp: 8, ep: 8, dp-attn: true, conc-start: 4096, conc-end: 4096 }
 
   # DeepSeek-V4-Pro on B300 with EAGLE/MTP speculative decoding. Recipe is
   # selected inside benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh by
diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh
index 2efe2d2e0..a52f2d6f8 100755
--- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh
@@ -117,25 +117,6 @@ elif [ "$CONC" = "2048" ]; then
         --enable-mixed-chunk
     )
 
-elif [ "$CONC" = "4096" ]; then
-    # DP attention, megamoe
-    export NVSHMEM_DISABLE_IB=1
-    MEM_FRACTION_STATIC=0.835
-    SWA_FULL_TOKENS_RATIO=0.075
-    MAX_RUNNING_REQUESTS=4352
-    PARALLEL_ARGS=(
-        --dp-size "$TP"
-        --enable-dp-attention
-        --moe-a2a-backend megamoe
-        --cuda-graph-max-bs 544
-        --chunked-prefill-size 16384
-        --tokenizer-worker-num 8
-        --enable-prefill-delayer
-        --decode-log-interval 5
-        --enable-deepseek-v4-fp4-indexer
-        --enable-mixed-chunk
-    )
-
 elif [ "$CONC" = "8192" ]; then
     # DP attention, megamoe
     export NVSHMEM_DISABLE_IB=1