From a1eb2ed8e560b6758cd862a7a459e0a949fbc38e Mon Sep 17 00:00:00 2001
From: Rohit Pujar Nagraj <rpujarnagraj@nvidia.com>
Date: Tue, 9 Jun 2026 15:04:08 -0700
Subject: [PATCH 1/4] perf: align kimik2.5-fp4-b300-vllm server launch with
 B200 recipe

Add the tuning args missing on B300 (--kv-cache-dtype fp8,
--max-cudagraph-capture-size 2048, --max-num-batched-tokens $((ISL * 2)),
--stream-interval 20) to match the B200 recipe and close the observed
perf gap. Also append a perf-changelog entry.
---
 benchmarks/single_node/fixed_seq_len/kimik2.5_fp4_b300.sh | 4 ++++
 perf-changelog.yaml                                       | 6 ++++++
 2 files changed, 10 insertions(+)
diff --git a/benchmarks/single_node/fixed_seq_len/kimik2.5_fp4_b300.sh b/benchmarks/single_node/fixed_seq_len/kimik2.5_fp4_b300.sh
index db6d3fb0d..03ab34d0c 100755
--- a/benchmarks/single_node/fixed_seq_len/kimik2.5_fp4_b300.sh
+++ b/benchmarks/single_node/fixed_seq_len/kimik2.5_fp4_b300.sh
@@ -56,6 +56,10 @@ vllm serve $MODEL_PATH --served-model-name $MODEL --host 0.0.0.0 --port $PORT \
 --reasoning-parser kimi_k2 \
 --tool-call-parser kimi_k2 \
 --compilation_config.pass_config.fuse_allreduce_rms true \
+--kv-cache-dtype fp8 \
+--max-cudagraph-capture-size 2048 \
+--max-num-batched-tokens "$((ISL * 2 ))" \
+--stream-interval 20 \
 --no-enable-prefix-caching \
 --trust-remote-code > $SERVER_LOG 2>&1 &
 
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 5622173f1..1f5485e25 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3531,3 +3531,9 @@
     - "The Rust frontend replaces only the Python serving/API layer (HTTP, tokenization, scheduling glue, detokenization) and spawns the same Python EngineCore, so GPU kernels/attention/MoE GEMM/KV cache are untouched"
     - "A/B sweep (28 single-node points, 1k1k + 8k1k, TP 1/2/4) vs the Python-frontend baseline (run 26696260751): throughput Pareto-neutral (peak tok/s/GPU within <1.5%, frontiers coincident) and TPOT flat (+-0.5%); TTFT improves ~8% at 1k1k and ~22% at 8k1k (every point), the expected signature of lower frontend CPU latency before first token, scaling with input length"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1634
+
+- config-keys:
+    - kimik2.5-fp4-b300-vllm
+  description:
+    - "Align server launch with the B200 recipe by adding the tuning args that were missing on B300: --kv-cache-dtype fp8, --max-cudagraph-capture-size 2048, --max-num-batched-tokens \"$((ISL * 2))\", and --stream-interval 20."
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/<TODO>

From 62c377e78d0885e6018014a37458806a071de494 Mon Sep 17 00:00:00 2001
From: Rohit Pujar Nagraj <rpujarnagraj@nvidia.com>
Date: Tue, 9 Jun 2026 15:04:35 -0700
Subject: [PATCH 2/4] chore: set perf-changelog pr-link for
 kimik2.5-fp4-b300-vllm server tuning

---
 perf-changelog.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 1f5485e25..46401a26b 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3536,4 +3536,4 @@
     - kimik2.5-fp4-b300-vllm
   description:
     - "Align server launch with the B200 recipe by adding the tuning args that were missing on B300: --kv-cache-dtype fp8, --max-cudagraph-capture-size 2048, --max-num-batched-tokens \"$((ISL * 2))\", and --stream-interval 20."
-  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/<TODO>
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1698

From 3274c53587fc6a8ac7e7bb96199110a1d859f5cb Mon Sep 17 00:00:00 2001
From: Rohit Pujar Nagraj <rpujarnagraj@nvidia.com>
Date: Tue, 9 Jun 2026 19:20:52 -0700
Subject: [PATCH 3/4] perf: raise max-cudagraph-capture-size from 2048 to 8192

---
 benchmarks/single_node/fixed_seq_len/kimik2.5_fp4_b300.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/single_node/fixed_seq_len/kimik2.5_fp4_b300.sh b/benchmarks/single_node/fixed_seq_len/kimik2.5_fp4_b300.sh
index 03ab34d0c..9d2f6d5db 100755
--- a/benchmarks/single_node/fixed_seq_len/kimik2.5_fp4_b300.sh
+++ b/benchmarks/single_node/fixed_seq_len/kimik2.5_fp4_b300.sh
@@ -57,7 +57,7 @@ vllm serve $MODEL_PATH --served-model-name $MODEL --host 0.0.0.0 --port $PORT \
 --tool-call-parser kimi_k2 \
 --compilation_config.pass_config.fuse_allreduce_rms true \
 --kv-cache-dtype fp8 \
---max-cudagraph-capture-size 2048 \
+--max-cudagraph-capture-size 8192 \
 --max-num-batched-tokens "$((ISL * 2 ))" \
 --stream-interval 20 \
 --no-enable-prefix-caching \

From 993085b56f1dd441c461dd911f402226230fd967 Mon Sep 17 00:00:00 2001
From: Rohit Pujar Nagraj <rpujarnagraj@nvidia.com>
Date: Tue, 9 Jun 2026 20:10:04 -0700
Subject: [PATCH 4/4] perf: tune kimik2.5-fp4-b300-vllm server launch
 (batched-tokens=cudagraph, disable mem profiler)

Set --max-num-batched-tokens to 8192 to match --max-cudagraph-capture-size,
and disable the CUDA-graph memory profiler
(VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=0) so that
--gpu-memory-utilization is honored in full. Also update the
perf-changelog entry.
---
 .../single_node/fixed_seq_len/kimik2.5_fp4_b300.sh    | 11 ++++++++---
 perf-changelog.yaml                                   |  2 +-
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/benchmarks/single_node/fixed_seq_len/kimik2.5_fp4_b300.sh b/benchmarks/single_node/fixed_seq_len/kimik2.5_fp4_b300.sh
index 9d2f6d5db..ed4a0e171 100755
--- a/benchmarks/single_node/fixed_seq_len/kimik2.5_fp4_b300.sh
+++ b/benchmarks/single_node/fixed_seq_len/kimik2.5_fp4_b300.sh
@@ -38,6 +38,12 @@ nvidia-smi
 export TORCH_CUDA_ARCH_LIST="10.0"
 export PYTHONNOUSERSITE=1
 
+# vLLM v0.20.2+'s CUDA-graph memory profiler pre-reserves a large chunk of GPU
+# memory upfront, which collides with --gpu-memory-utilization=0.90 and shrinks
+# the effective budget left for the KV cache. Disable the profiler so 0.90 means
+# 0.90 (same pattern as benchmarks/single_node/fixed_seq_len/kimik2.5_fp4_b200.sh).
+export VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=0
+
 SERVER_LOG=/workspace/server.log
 
 if [ "${EVAL_ONLY}" = "true" ]; then
@@ -58,9 +64,8 @@ vllm serve $MODEL_PATH --served-model-name $MODEL --host 0.0.0.0 --port $PORT \
 --compilation_config.pass_config.fuse_allreduce_rms true \
 --kv-cache-dtype fp8 \
 --max-cudagraph-capture-size 8192 \
---max-num-batched-tokens "$((ISL * 2 ))" \
---stream-interval 20 \
---no-enable-prefix-caching \
+--max-num-batched-tokens 8192 \
+--stream-interval 20 --no-enable-prefix-caching \
 --trust-remote-code > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 46401a26b..4a216390d 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3535,5 +3535,5 @@
 - config-keys:
     - kimik2.5-fp4-b300-vllm
   description:
-    - "Align server launch with the B200 recipe by adding the tuning args that were missing on B300: --kv-cache-dtype fp8, --max-cudagraph-capture-size 2048, --max-num-batched-tokens \"$((ISL * 2))\", and --stream-interval 20."
+    - "Tune the B300 vLLM server launch: set --kv-cache-dtype fp8, --max-cudagraph-capture-size 8192, --max-num-batched-tokens 8192 (matched to the cudagraph capture size), and --stream-interval 20; disable the CUDA-graph memory profiler (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=0) so --gpu-memory-utilization is honored in full."
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1698