# Patch: benchmarks/single_node/fixed_seq_len/kimik2.5_fp4_b300.sh
#
# Hunk 1 (-38,6 +38,12): after the existing PYTHONNOUSERSITE export, adds
#   export VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=0
# together with a four-line comment giving the rationale.
#
# Hunk 2 (-56,7 +62,10): in the `vllm serve` invocation, removes the standalone
# `--no-enable-prefix-caching \` line and adds:
#   --kv-cache-dtype fp8
#   --max-cudagraph-capture-size 8192
#   --max-num-batched-tokens 8192
#   --stream-interval 20 --no-enable-prefix-caching   (two flags on one line)
# so prefix caching stays disabled; the flag only moves onto the
# --stream-interval line.
#
# NOTE(review): this diff appears to have been flattened onto a single physical
# line (original newlines replaced by spaces). The hunk line counts cannot be
# satisfied in this form, and blank context lines are not recoverable from this
# text alone -- regenerate the patch from the source PR before applying.
diff --git a/benchmarks/single_node/fixed_seq_len/kimik2.5_fp4_b300.sh b/benchmarks/single_node/fixed_seq_len/kimik2.5_fp4_b300.sh index db6d3fb0d..ed4a0e171 100755 --- a/benchmarks/single_node/fixed_seq_len/kimik2.5_fp4_b300.sh +++ b/benchmarks/single_node/fixed_seq_len/kimik2.5_fp4_b300.sh @@ -38,6 +38,12 @@ nvidia-smi export TORCH_CUDA_ARCH_LIST="10.0" export PYTHONNOUSERSITE=1 +# vLLM v0.20.2+'s CUDA-graph memory profiler pre-reserves a large chunk of GPU +# memory upfront, which collides with --gpu-memory-utilization=0.90 and shrinks +# the effective budget left for the KV cache. Disable the profiler so 0.90 means +# 0.90 (same pattern as benchmarks/single_node/fixed_seq_len/kimik2.5_fp4_b200.sh). +export VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=0 + SERVER_LOG=/workspace/server.log if [ "${EVAL_ONLY}" = "true" ]; then @@ -56,7 +62,10 @@ vllm serve $MODEL_PATH --served-model-name $MODEL --host 0.0.0.0 --port $PORT \ --reasoning-parser kimi_k2 \ --tool-call-parser kimi_k2 \ --compilation_config.pass_config.fuse_allreduce_rms true \ ---no-enable-prefix-caching \ +--kv-cache-dtype fp8 \ +--max-cudagraph-capture-size 8192 \ +--max-num-batched-tokens 8192 \ +--stream-interval 20 --no-enable-prefix-caching \ --trust-remote-code > $SERVER_LOG 2>&1 & SERVER_PID=$! 
# Patch: perf-changelog.yaml
#
# Single hunk (-3531,3 +3531,9): appends one changelog entry after the entry
# whose pr-link is .../pull/1634. The new entry has:
#   config-keys: kimik2.5-fp4-b300-vllm
#   description: one bullet summarising the B300 vLLM launch changes
#                (--kv-cache-dtype fp8, --max-cudagraph-capture-size 8192,
#                --max-num-batched-tokens 8192, --stream-interval 20,
#                VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=0)
#   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1698
#
# NOTE(review): this diff appears to have been flattened onto a single physical
# line, so the YAML indentation of both the context and the added lines is not
# recoverable from this text alone -- regenerate the patch from the source PR
# before applying, and confirm the new entry's indentation matches its
# neighbours in perf-changelog.yaml.
diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 5622173f1..4a216390d 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3531,3 +3531,9 @@ - "The Rust frontend replaces only the Python serving/API layer (HTTP, tokenization, scheduling glue, detokenization) and spawns the same Python EngineCore, so GPU kernels/attention/MoE GEMM/KV cache are untouched" - "A/B sweep (28 single-node points, 1k1k + 8k1k, TP 1/2/4) vs the Python-frontend baseline (run 26696260751): throughput Pareto-neutral (peak tok/s/GPU within <1.5%, frontiers coincident) and TPOT flat (+-0.5%); TTFT improves ~8% at 1k1k and ~22% at 8k1k (every point), the expected signature of lower frontend CPU latency before first token, scaling with input length" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1634 + +- config-keys: + - kimik2.5-fp4-b300-vllm + description: + - "Tune the B300 vLLM server launch: set --kv-cache-dtype fp8, --max-cudagraph-capture-size 8192, --max-num-batched-tokens 8192 (matched to the cudagraph capture size), and --stream-interval 20; disable the CUDA-graph memory profiler (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=0) so --gpu-memory-utilization is honored in full." + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1698