Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4358,7 +4358,7 @@ minimaxm2.5-fp8-b200-vllm-agentic:
# MiniMax-M2.5 FP8 B200 vLLM recipe as-is until B300-specific tuning is available.

minimaxm2.5-fp8-b300-vllm:
image: vllm/vllm-openai:v0.21.0
image: vllm/vllm-openai:v0.22.0
model: MiniMaxAI/MiniMax-M2.5
model-prefix: minimaxm2.5
runner: b300
Expand Down Expand Up @@ -4462,7 +4462,7 @@ minimaxm2.5-fp4-b200-vllm-agentic:
# MiniMax-M2.5 FP4 B200 vLLM recipe as-is until B300-specific tuning is available.

minimaxm2.5-fp4-b300-vllm:
image: vllm/vllm-openai:v0.21.0
image: vllm/vllm-openai:v0.22.0
model: nvidia/MiniMax-M2.5-NVFP4
model-prefix: minimaxm2.5
runner: b300
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
SERVER_LOG=/workspace/server.log

export VLLM_FLOAT32_MATMUL_PRECISION=high
export VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=0

if [ "${DP_ATTENTION}" = "true" ]; then
PARALLEL_ARGS="--tensor-parallel-size=1 --data-parallel-size=$TP --enable-expert-parallel"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ nvidia-smi
SERVER_LOG=/workspace/server.log

export VLLM_FLOAT32_MATMUL_PRECISION=high
export VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=0

if [ "${DP_ATTENTION}" = "true" ]; then
PARALLEL_ARGS="--tensor-parallel-size 1 --data-parallel-size $TP --enable-expert-parallel"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
SERVER_LOG=/workspace/server.log

export VLLM_FLOAT32_MATMUL_PRECISION=high
export VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=0

if [ "$EP_SIZE" -gt 1 ]; then
EP=" --enable-expert-parallel"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ nvidia-smi
SERVER_LOG=/workspace/server.log

export VLLM_FLOAT32_MATMUL_PRECISION=high
export VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=0

if [ "$EP_SIZE" -gt 1 ]; then
EP=" --enable-expert-parallel"
Expand Down
10 changes: 10 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3531,3 +3531,13 @@
- "The Rust frontend replaces only the Python serving/API layer (HTTP, tokenization, scheduling glue, detokenization) and spawns the same Python EngineCore, so GPU kernels/attention/MoE GEMM/KV cache are untouched"
- "A/B sweep (28 single-node points, 1k1k + 8k1k, TP 1/2/4) vs the Python-frontend baseline (run 26696260751): throughput Pareto-neutral (peak tok/s/GPU within <1.5%, frontiers coincident) and TPOT flat (+-0.5%); TTFT improves ~8% at 1k1k and ~22% at 8k1k (every point), the expected signature of lower frontend CPU latency before first token, scaling with input length"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1634

- config-keys:
- minimaxm2.5-fp8-b200-vllm
- minimaxm2.5-fp8-b300-vllm
- minimaxm2.5-fp4-b200-vllm
- minimaxm2.5-fp4-b300-vllm
description:
- "Use vLLM image v0.22.0 for MiniMax-M2.5 FP8/FP4 B200/B300 aggregate benchmarks."
- "Set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=0 in the B200/B300 MiniMax aggregate launch scripts."
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1704
Loading