From a1eb2ed8e560b6758cd862a7a459e0a949fbc38e Mon Sep 17 00:00:00 2001 From: Rohit Pujar Nagraj Date: Tue, 9 Jun 2026 15:04:08 -0700 Subject: [PATCH 1/4] perf: align kimik2.5-fp4-b300-vllm server launch with B200 recipe Add the tuning args missing on B300 (--kv-cache-dtype fp8, --max-cudagraph-capture-size 2048, --max-num-batched-tokens $((ISL*2)), --stream-interval 20) to match the B200 recipe and close the observed perf gap. Appends a perf-changelog entry. --- benchmarks/single_node/fixed_seq_len/kimik2.5_fp4_b300.sh | 4 ++++ perf-changelog.yaml | 6 ++++++ 2 files changed, 10 insertions(+) diff --git a/benchmarks/single_node/fixed_seq_len/kimik2.5_fp4_b300.sh b/benchmarks/single_node/fixed_seq_len/kimik2.5_fp4_b300.sh index db6d3fb0d..03ab34d0c 100755 --- a/benchmarks/single_node/fixed_seq_len/kimik2.5_fp4_b300.sh +++ b/benchmarks/single_node/fixed_seq_len/kimik2.5_fp4_b300.sh @@ -56,6 +56,10 @@ vllm serve $MODEL_PATH --served-model-name $MODEL --host 0.0.0.0 --port $PORT \ --reasoning-parser kimi_k2 \ --tool-call-parser kimi_k2 \ --compilation_config.pass_config.fuse_allreduce_rms true \ +--kv-cache-dtype fp8 \ +--max-cudagraph-capture-size 2048 \ +--max-num-batched-tokens "$((ISL * 2 ))" \ +--stream-interval 20 \ --no-enable-prefix-caching \ --trust-remote-code > $SERVER_LOG 2>&1 & diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 5622173f1..1f5485e25 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3531,3 +3531,9 @@ - "The Rust frontend replaces only the Python serving/API layer (HTTP, tokenization, scheduling glue, detokenization) and spawns the same Python EngineCore, so GPU kernels/attention/MoE GEMM/KV cache are untouched" - "A/B sweep (28 single-node points, 1k1k + 8k1k, TP 1/2/4) vs the Python-frontend baseline (run 26696260751): throughput Pareto-neutral (peak tok/s/GPU within <1.5%, frontiers coincident) and TPOT flat (+-0.5%); TTFT improves ~8% at 1k1k and ~22% at 8k1k (every point), the expected signature of lower frontend CPU latency before first token, scaling with input length" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1634 + +- config-keys: + - kimik2.5-fp4-b300-vllm + description: + - "Align server launch with the B200 recipe by adding the tuning args that were missing on B300: --kv-cache-dtype fp8, --max-cudagraph-capture-size 2048, --max-num-batched-tokens \"$((ISL * 2))\", and --stream-interval 20." + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/ From 62c377e78d0885e6018014a37458806a071de494 Mon Sep 17 00:00:00 2001 From: Rohit Pujar Nagraj Date: Tue, 9 Jun 2026 15:04:35 -0700 Subject: [PATCH 2/4] chore: set perf-changelog pr-link for kimik2.5-fp4-b300-vllm server tuning --- perf-changelog.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 1f5485e25..46401a26b 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3536,4 +3536,4 @@ - kimik2.5-fp4-b300-vllm description: - "Align server launch with the B200 recipe by adding the tuning args that were missing on B300: --kv-cache-dtype fp8, --max-cudagraph-capture-size 2048, --max-num-batched-tokens \"$((ISL * 2))\", and --stream-interval 20." - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/ + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1698 From 3274c53587fc6a8ac7e7bb96199110a1d859f5cb Mon Sep 17 00:00:00 2001 From: Rohit Pujar Nagraj Date: Tue, 9 Jun 2026 19:20:52 -0700 Subject: [PATCH 3/4] Changed max-cudagraph-capture-size --- benchmarks/single_node/fixed_seq_len/kimik2.5_fp4_b300.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/single_node/fixed_seq_len/kimik2.5_fp4_b300.sh b/benchmarks/single_node/fixed_seq_len/kimik2.5_fp4_b300.sh index 03ab34d0c..9d2f6d5db 100755 --- a/benchmarks/single_node/fixed_seq_len/kimik2.5_fp4_b300.sh +++ b/benchmarks/single_node/fixed_seq_len/kimik2.5_fp4_b300.sh @@ -57,7 +57,7 @@ vllm serve $MODEL_PATH --served-model-name $MODEL --host 0.0.0.0 --port $PORT \ --tool-call-parser kimi_k2 \ --compilation_config.pass_config.fuse_allreduce_rms true \ --kv-cache-dtype fp8 \ ---max-cudagraph-capture-size 2048 \ +--max-cudagraph-capture-size 8192 \ --max-num-batched-tokens "$((ISL * 2 ))" \ --stream-interval 20 \ --no-enable-prefix-caching \ From 993085b56f1dd441c461dd911f402226230fd967 Mon Sep 17 00:00:00 2001 From: Rohit Pujar Nagraj Date: Tue, 9 Jun 2026 20:10:04 -0700 Subject: [PATCH 4/4] perf: tune kimik2.5-fp4-b300-vllm server launch (batched-tokens=cudagraph, disable mem profiler) Set --max-num-batched-tokens 8192 to match --max-cudagraph-capture-size, and disable the CUDA-graph memory profiler (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=0) so --gpu-memory-utilization is honored in full. Updates the perf-changelog entry. --- .../single_node/fixed_seq_len/kimik2.5_fp4_b300.sh | 11 ++++++++--- perf-changelog.yaml | 2 +- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/benchmarks/single_node/fixed_seq_len/kimik2.5_fp4_b300.sh b/benchmarks/single_node/fixed_seq_len/kimik2.5_fp4_b300.sh index 9d2f6d5db..ed4a0e171 100755 --- a/benchmarks/single_node/fixed_seq_len/kimik2.5_fp4_b300.sh +++ b/benchmarks/single_node/fixed_seq_len/kimik2.5_fp4_b300.sh @@ -38,6 +38,12 @@ nvidia-smi export TORCH_CUDA_ARCH_LIST="10.0" export PYTHONNOUSERSITE=1 +# vLLM v0.20.2+'s CUDA-graph memory profiler pre-reserves a large chunk of GPU +# memory upfront, which collides with --gpu-memory-utilization=0.90 and shrinks +# the effective budget left for the KV cache. Disable the profiler so 0.90 means +# 0.90 (same pattern as benchmarks/single_node/fixed_seq_len/kimik2.5_fp4_b200.sh). +export VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=0 + SERVER_LOG=/workspace/server.log if [ "${EVAL_ONLY}" = "true" ]; then @@ -58,9 +64,8 @@ vllm serve $MODEL_PATH --served-model-name $MODEL --host 0.0.0.0 --port $PORT \ --compilation_config.pass_config.fuse_allreduce_rms true \ --kv-cache-dtype fp8 \ --max-cudagraph-capture-size 8192 \ ---max-num-batched-tokens "$((ISL * 2 ))" \ ---stream-interval 20 \ ---no-enable-prefix-caching \ +--max-num-batched-tokens 8192 \ +--stream-interval 20 --no-enable-prefix-caching \ --trust-remote-code > $SERVER_LOG 2>&1 & SERVER_PID=$! diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 46401a26b..4a216390d 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3535,5 +3535,5 @@ - config-keys: - kimik2.5-fp4-b300-vllm description: - - "Align server launch with the B200 recipe by adding the tuning args that were missing on B300: --kv-cache-dtype fp8, --max-cudagraph-capture-size 2048, --max-num-batched-tokens \"$((ISL * 2))\", and --stream-interval 20." + - "Tune the B300 vLLM server launch: set --kv-cache-dtype fp8, --max-cudagraph-capture-size 8192, --max-num-batched-tokens 8192 (matched to the cudagraph capture size), and --stream-interval 20; disable the CUDA-graph memory profiler (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=0) so --gpu-memory-utilization is honored in full." pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1698