SemiAnalysisAI · RohitNagraj · Jun 9, 2026 · Jun 9, 2026 · Jun 10, 2026 · Jun 10, 2026
diff --git a/benchmarks/single_node/fixed_seq_len/kimik2.5_fp4_b300.sh b/benchmarks/single_node/fixed_seq_len/kimik2.5_fp4_b300.sh
@@ -38,6 +38,12 @@ nvidia-smi
 export TORCH_CUDA_ARCH_LIST="10.0"
 export PYTHONNOUSERSITE=1
 
+# In vLLM v0.20.2+ the memory profiler's CUDA-graph estimate pre-reserves a large
+# chunk of GPU memory up front, which collides with --gpu-memory-utilization=0.90
+# and shrinks the effective budget left for the KV cache. Turn the estimate off so
+# 0.90 means 0.90 (same pattern as benchmarks/single_node/fixed_seq_len/kimik2.5_fp4_b200.sh).
+export VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=0
+
 SERVER_LOG=/workspace/server.log
 
 if [ "${EVAL_ONLY}" = "true" ]; then
@@ -56,7 +62,10 @@ vllm serve $MODEL_PATH --served-model-name $MODEL --host 0.0.0.0 --port $PORT \
 --reasoning-parser kimi_k2 \
 --tool-call-parser kimi_k2 \
 --compilation_config.pass_config.fuse_allreduce_rms true \
---no-enable-prefix-caching \
+--kv-cache-dtype fp8 \
+--max-cudagraph-capture-size 8192 \
+--max-num-batched-tokens 8192 \
+--stream-interval 20 --no-enable-prefix-caching \
 --trust-remote-code > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
@@ -3531,3 +3531,9 @@
     - "The Rust frontend replaces only the Python serving/API layer (HTTP, tokenization, scheduling glue, detokenization) and spawns the same Python EngineCore, so GPU kernels/attention/MoE GEMM/KV cache are untouched"
     - "A/B sweep (28 single-node points, 1k1k + 8k1k, TP 1/2/4) vs the Python-frontend baseline (run 26696260751): throughput Pareto-neutral (peak tok/s/GPU within <1.5%, frontiers coincident) and TPOT flat (+-0.5%); TTFT improves ~8% at 1k1k and ~22% at 8k1k (every point), the expected signature of lower frontend CPU latency before first token, scaling with input length"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1634
+
+- config-keys:
+    - kimik2.5-fp4-b300-vllm
+  description:
+    - "Tune the B300 vLLM server launch: set --kv-cache-dtype fp8, --max-cudagraph-capture-size 8192, --max-num-batched-tokens 8192 (matched to the cudagraph capture size), and --stream-interval 20; disable the memory profiler's CUDA-graph estimate (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=0) so --gpu-memory-utilization is honored in full."
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1698