SemiAnalysisAI · yhyang201 · Jun 8, 2026 · Jun 8, 2026 · Jun 8, 2026 · Jun 8, 2026
@@ -2003,7 +2003,7 @@ dsr1-fp8-b300-sglang:
 # DeepSeek-V4-Pro on B300 with sglang (non-MTP).
 # Uses nightly image with megamoe backend for high-concurrency profiles.
 dsv4-fp4-b300-sglang:
-  image: lmsysorg/sglang:nightly-dev-cu13-20260529-a8cfae0b
+  image: lmsysorg/sglang:nightly-dev-cu13-20260609-317fc6a9
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: b300
@@ -2033,7 +2033,6 @@ dsv4-fp4-b300-sglang:
       - { tp: 4, ep: 1, conc-start: 32, conc-end: 32 }
       - { tp: 4, ep: 1, dp-attn: true, conc-start: 512, conc-end: 512 }
       - { tp: 8, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 }
-      - { tp: 8, ep: 8, dp-attn: true, conc-start: 4096, conc-end: 4096 }
 
   # DeepSeek-V4-Pro on B300 with EAGLE/MTP speculative decoding. Recipe is
   # selected inside benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh by

diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh
@@ -30,8 +30,11 @@ fi
 
 nvidia-smi
 
-# ─── Common env vars (all profiles) ───────────────────────────────────────────
-export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0
+# ─── Common env vars (all profiles, GB300-aligned) ──────────────────────────
+export SGLANG_JIT_DEEPGEMM_FAST_WARMUP=1
+export SGLANG_RADIX_FORCE_MISS=1
+export SGLANG_DEFAULT_THINKING=1
+export SGLANG_DSV4_REASONING_EFFORT=max
 export SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT=1
 
 SERVER_LOG="$PWD/server.log"
@@ -46,9 +49,25 @@ fi
 
 start_gpu_monitor --output "$PWD/gpu_metrics.csv"
 
+# ─── DP-attention env vars (GB300-aligned) ───────────────────────────────────
+# Shared across all DP-attention profiles (conc >= 512). Set before per-conc
+# tuning so individual blocks only carry NVSHMEM / batch-size overrides.
+if [ "$CONC" != "1" ] && [ "$CONC" != "32" ]; then
+    export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1
+    export SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW=1
+    export SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_FP4_ACTS=1
+    export SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_MXF4_KIND=1
+    export SGLANG_OPT_USE_ONLINE_COMPRESS=1
+    export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8192
+    export SGLANG_LOG_FORWARD_ITERS=1
+    export SGLANG_LOG_MS=1
+    export SGLANG_REQUEST_STATE_WAIT_TIMEOUT=60
+    export SGLANG_CLIP_MAX_NEW_TOKENS_ESTIMATION=8
+fi
+
 # ─── Per-concurrency launch profile ──────────────────────────────────────────
 # Each block sets: PARALLEL_ARGS, MEM_FRACTION_STATIC, SWA_FULL_TOKENS_RATIO,
-# and optionally MAX_RUNNING_REQUESTS plus profile-specific env vars.
+# and optionally MAX_RUNNING_REQUESTS.
 #
 # SWA ratio: 1k inputs need more SWA cache headroom than 8k inputs; 0.5 was
 # tuned empirically for the 1k1k recipe, while 0.1 is the cookbook default.
@@ -61,11 +80,12 @@ if [ "$CONC" = "1" ] || [ "$CONC" = "32" ]; then
         --moe-runner-backend flashinfer_mxfp4
         --chunked-prefill-size 8192
         --disable-flashinfer-autotune
+        --enable-deepseek-v4-fp4-indexer
+        --enable-mixed-chunk
     )
 
 elif [ "$CONC" = "512" ]; then
     # DP attention, flashinfer_mxfp4
-    export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1
     MEM_FRACTION_STATIC=0.94
     SWA_FULL_TOKENS_RATIO=$([[ "$ISL" == "1024" ]] && echo 0.5 || echo 0.1)
     PARALLEL_ARGS=(
@@ -75,15 +95,13 @@ elif [ "$CONC" = "512" ]; then
         --disable-flashinfer-autotune
         --chunked-prefill-size 16384
         --enable-prefill-delayer
+        --enable-deepseek-v4-fp4-indexer
+        --enable-mixed-chunk
     )
 
 elif [ "$CONC" = "2048" ]; then
     # DP attention, megamoe
-    export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1
     export NVSHMEM_DISABLE_IB=1
-    export SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW=1
-    export SGLANG_LOG_FORWARD_ITERS=1
-    export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8320
     MEM_FRACTION_STATIC=0.87
     SWA_FULL_TOKENS_RATIO=0.06
     MAX_RUNNING_REQUESTS=2560
@@ -92,38 +110,16 @@ elif [ "$CONC" = "2048" ]; then
         --enable-dp-attention
         --moe-a2a-backend megamoe
         --cuda-graph-max-bs 288
-        --chunked-prefill-size 65536
+        --chunked-prefill-size 16384
         --tokenizer-worker-num 4
         --enable-prefill-delayer
-    )
-
-elif [ "$CONC" = "4096" ]; then
-    # DP attention, megamoe
-    export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1
-    export NVSHMEM_DISABLE_IB=1
-    export SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW=1
-    export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8320
-    MEM_FRACTION_STATIC=0.835
-    SWA_FULL_TOKENS_RATIO=0.075
-    MAX_RUNNING_REQUESTS=4352
-    PARALLEL_ARGS=(
-        --dp-size "$TP"
-        --enable-dp-attention
-        --moe-a2a-backend megamoe
-        --cuda-graph-max-bs 544
-        --chunked-prefill-size 65536
-        --tokenizer-worker-num 8
-        --enable-prefill-delayer
-        --decode-log-interval 5
+        --enable-deepseek-v4-fp4-indexer
+        --enable-mixed-chunk
     )
 
 elif [ "$CONC" = "8192" ]; then
     # DP attention, megamoe
-    export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1
     export NVSHMEM_DISABLE_IB=1
-    export SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW=1
-    export SGLANG_OPT_USE_ONLINE_COMPRESS=1
-    export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8256
     MEM_FRACTION_STATIC=0.80
     SWA_FULL_TOKENS_RATIO=0.3
     MAX_RUNNING_REQUESTS=8192
@@ -132,10 +128,12 @@ elif [ "$CONC" = "8192" ]; then
         --enable-dp-attention
         --moe-a2a-backend megamoe
         --cuda-graph-max-bs 1088
-        --chunked-prefill-size 65536
+        --chunked-prefill-size 16384
         --tokenizer-worker-num 16
         --enable-prefill-delayer
         --stream-interval 30
+        --enable-deepseek-v4-fp4-indexer
+        --enable-mixed-chunk
     )
 
 else

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
@@ -3531,3 +3531,17 @@
     - "The Rust frontend replaces only the Python serving/API layer (HTTP, tokenization, scheduling glue, detokenization) and spawns the same Python EngineCore, so GPU kernels/attention/MoE GEMM/KV cache are untouched"
     - "A/B sweep (28 single-node points, 1k1k + 8k1k, TP 1/2/4) vs the Python-frontend baseline (run 26696260751): throughput Pareto-neutral (peak tok/s/GPU within <1.5%, frontiers coincident) and TPOT flat (+-0.5%); TTFT improves ~8% at 1k1k and ~22% at 8k1k (every point), the expected signature of lower frontend CPU latency before first token, scaling with input length"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1634
+
+- config-keys:
+    - dsv4-fp4-b300-sglang
+  description:
+    - "Align env vars to GB300: replace PRECOMPILE=0 with FAST_WARMUP=1, add RADIX_FORCE_MISS, DEFAULT_THINKING, DSV4_REASONING_EFFORT=max"
+    - "Add shared DP-attention env vars: MEGA_MOE_USE_FP4_ACTS, USE_MXF4_KIND, USE_ONLINE_COMPRESS, unified NUM_MAX_TOKENS_PER_RANK=8192"
+    - "Bump image to nightly-dev-cu13-20260601-373cadc9"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1682
+
+- config-keys:
+    - dsv4-fp4-b300-sglang
+  description:
+    - "Add --enable-mixed-chunk to all concurrency profiles, set chunked-prefill-size 16384 for high-conc megamoe profiles"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1693
diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh
@@ -414,7 +414,7 @@ else
         --container-mounts=$GITHUB_WORKSPACE:$CONTAINER_MOUNT_DIR,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE_MOUNT,$WRITABLE_MODELS_DIR:$WRITABLE_MODELS_DIR \
         --no-container-mount-home \
         --container-workdir=$CONTAINER_MOUNT_DIR \
-        --no-container-entrypoint --export=ALL,PORT=8888 \
+        --no-container-entrypoint --export=ALL,PORT=30000 \
         bash "$BENCH_SCRIPT"
 
 fi