Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4488,6 +4488,34 @@ minimaxm2.5-fp4-b300-vllm:
- { tp: 4, conc-start: 4, conc-end: 8 }
- { tp: 8, conc-start: 4, conc-end: 4 }

# TensorRT-LLM (framework: trt) single-node benchmark entry for
# nvidia/MiniMax-M2.5-NVFP4 at fp4 precision on the b300 runner.
minimaxm2.5-fp4-b300-trt:
# NOTE(review): the '#' after the registry host looks intentional -- the B300
# launcher sanitizes '#' when deriving the squash-file name from the image.
# Confirm before "correcting" it to '/'.
image: nvcr.io#nvidia/tensorrt-llm/release:1.3.0rc18
model: nvidia/MiniMax-M2.5-NVFP4
model-prefix: minimaxm2.5
runner: b300
precision: fp4
framework: trt
multinode: false
scenarios:
fixed-seq-len:
# 1k input / 1k output.
# NOTE(review): tp / ep / dp-attn presumably feed the TP, EP_SIZE and
# DP_ATTENTION variables required by the benchmark script, and
# conc-start..conc-end presumably bound the concurrency sweep -- confirm
# against the workflow that consumes this file.
- isl: 1024
osl: 1024
search-space:
# Attention-DP layouts, swept at high concurrency.
- { tp: 2, ep: 2, dp-attn: true, conc-start: 2048, conc-end: 2048 }
- { tp: 4, ep: 4, dp-attn: true, conc-start: 1024, conc-end: 2048 }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 1024 }
# Layouts without attention DP, swept at low concurrency.
- { tp: 1, ep: 1, dp-attn: false, conc-start: 4, conc-end: 8 }
- { tp: 4, ep: 2, dp-attn: false, conc-start: 4, conc-end: 8 }
- { tp: 8, ep: 2, dp-attn: false, conc-start: 4, conc-end: 4 }
# 8k input / 1k output. No tp: 8 attention-DP point is listed for this shape.
- isl: 8192
osl: 1024
search-space:
- { tp: 2, ep: 2, dp-attn: true, conc-start: 128, conc-end: 512 }
- { tp: 4, ep: 4, dp-attn: true, conc-start: 128, conc-end: 128 }
- { tp: 1, ep: 1, dp-attn: false, conc-start: 4, conc-end: 16 }
- { tp: 4, ep: 2, dp-attn: false, conc-start: 4, conc-end: 4 }
- { tp: 8, ep: 2, dp-attn: false, conc-start: 4, conc-end: 4 }

gptoss-fp4-h100-vllm:
image: vllm/vllm-openai:v0.21.0
model: openai/gpt-oss-120b
Expand Down
150 changes: 150 additions & 0 deletions benchmarks/single_node/fixed_seq_len/minimaxm2.5_fp4_b300_trt.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
#!/usr/bin/env bash
#
# Fixed-sequence-length benchmark of MiniMax-M2.5 (NVFP4) served by
# TensorRT-LLM's trtllm-serve on a single B300 node.
#
# Required environment: MODEL, TP, CONC, ISL, OSL, MAX_MODEL_LEN,
# RANDOM_RANGE_RATIO, RESULT_FILENAME, DP_ATTENTION, EP_SIZE.
# Optional environment: MODEL_PATH, PORT, RUN_EVAL, EVAL_ONLY.

# Source benchmark utilities early
source "$(dirname "$0")/../../benchmark_lib.sh"

check_env_vars \
  MODEL \
  TP \
  CONC \
  ISL \
  OSL \
  MAX_MODEL_LEN \
  RANDOM_RANGE_RATIO \
  RESULT_FILENAME \
  DP_ATTENTION \
  EP_SIZE

#######################################
# For eval-only jobs, switch MAX_MODEL_LEN to the evaluation context length
# before it is written into the server config as max_seq_len. Without this
# the server keeps the throughput-tuned context cap and lm-eval requests can
# fail or be truncated.
# Globals:
#   EVAL_ONLY (read), MAX_MODEL_LEN (read/written), EVAL_MAX_MODEL_LEN (read)
# Returns:
#   0 always; a missing helper only produces a warning on stderr.
#######################################
apply_eval_only_context() {
  [[ "${EVAL_ONLY:-}" == "true" ]] || return 0

  if ! command -v setup_eval_context > /dev/null 2>&1; then
    echo "WARNING: EVAL_ONLY=true but setup_eval_context is unavailable;" \
      "keeping MAX_MODEL_LEN=$MAX_MODEL_LEN" >&2
    return 0
  fi

  setup_eval_context
  # NOTE(review): assumes setup_eval_context publishes the eval context length
  # as EVAL_MAX_MODEL_LEN -- confirm against benchmark_lib.sh. If it does not,
  # MAX_MODEL_LEN is left unchanged.
  MAX_MODEL_LEN="${EVAL_MAX_MODEL_LEN:-$MAX_MODEL_LEN}"
}
apply_eval_only_context

if [[ -n "$SLURM_JOB_ID" ]]; then
  echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
fi

echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION"

# Token budget per forward pass; also emitted as max_num_tokens in the config.
MAX_NUM_TOKENS=16384

#######################################
# Join all arguments into one "a, b, c" string (empty string for no args).
# Outputs: the joined list on stdout.
#######################################
_join_csv() {
  local joined
  printf -v joined '%s, ' "$@"
  printf '%s\n' "${joined%, }"
}

#######################################
# Build the capture_num_tokens list for torch_compile_config.
# Arguments:
#   $1 - max_num_tokens
#   $2 - concurrency
#   $3 - input sequence length
# Outputs: comma-separated token counts on stdout.
#######################################
capture_tokens_csv() {
  local -r max_num_tokens="$1" conc="$2" isl="$3"
  # Sizes from 4096 upward are capped at min(max_num_tokens, conc * isl).
  local -i limit=$(( max_num_tokens < conc * isl ? max_num_tokens : conc * isl ))
  local -a sizes=(1 2 4 8 12 16 24 32 48 64 96 128 192 256 384 512 768)

  # Word splitting of seq output is intended here.
  # shellcheck disable=SC2207
  sizes+=( $(seq 1024 128 2047) )
  # shellcheck disable=SC2207
  sizes+=( $(seq 2048 256 4095) )
  if (( limit >= 4096 )); then
    # shellcheck disable=SC2207
    sizes+=( $(seq 4096 512 "$limit") )
  fi
  _join_csv "${sizes[@]}"
}

#######################################
# Build the cuda_graph_config batch_sizes list for a given concurrency.
# Steps widen with batch size: 8 below 256, 16 below 512, 32 below 768,
# and 64 from 768 up to the concurrency itself.
# Arguments:
#   $1 - concurrency (used as the server's max batch size)
# Outputs: comma-separated batch sizes on stdout.
#######################################
capture_batch_csv() {
  local -r conc="$1"
  local -a sizes=(1 2 4 8 12)
  local tier start step end last

  # Each tier is "first step exclusive_end".
  for tier in "16 8 256" "256 16 512" "512 32 768"; do
    read -r start step end <<< "$tier"
    (( conc >= start )) || break
    last=$(( conc < end ? conc : end - 1 ))
    # shellcheck disable=SC2207
    sizes+=( $(seq "$start" "$step" "$last") )
  done

  # The last tier must begin where the previous one is capped (767). Gating it
  # on 1024 left concurrencies 768..1023 with a largest captured batch of 736,
  # i.e. below the configured max batch size.
  if (( conc >= 768 )); then
    # shellcheck disable=SC2207
    sizes+=( $(seq 768 64 "$conc") )
  fi
  _join_csv "${sizes[@]}"
}

CAPTURE_TOKENS_LIST=$(capture_tokens_csv "$MAX_NUM_TOKENS" "$CONC" "$ISL")
CAPTURE_BATCH_LIST=$(capture_batch_csv "$CONC")

# NOTE(review): nothing later in this script reads MAX_CAPTURE_TOKENS; the
# assignment is retained unchanged in case it was meant to be wired in.
MAX_CAPTURE_TOKENS=$(( CONC < 16 ? 4096 : MAX_NUM_TOKENS ))

# Generate the extra YAML options file that is handed to trtllm-serve through
# --config. The path is relative, so it is created in the current directory.
# The heredoc delimiter is unquoted: $DP_ATTENTION, $MAX_NUM_TOKENS,
# $MAX_MODEL_LEN and both capture lists are expanded by the shell at write time.
# ${VAR%, } strips a trailing ", " separator from the list string when present.
CONFIG_FILE="minimax-fp4.yaml"
cat << EOF > $CONFIG_FILE
cuda_graph_config:
enable_padding: true
batch_sizes: [${CAPTURE_BATCH_LIST%, }]
moe_config:
backend: TRTLLM
use_low_precision_moe_combine: true
enable_attention_dp: $DP_ATTENTION
torch_compile_config:
capture_num_tokens: [${CAPTURE_TOKENS_LIST%, }]
enable_piecewise_cuda_graph: true
stream_interval: 100
print_iter_log: true
max_num_tokens: $MAX_NUM_TOKENS
kv_cache_config:
free_gpu_memory_fraction: 0.9
enable_block_reuse: False
dtype: fp8
scheduler_config:
capacity_scheduler_policy: MAX_UTILIZATION
context_chunking_policy: FIRST_COME_FIRST_SERVED
nvfp4_gemm_config:
allowed_backends:
- cutlass
- cublaslt
- cutedsl
- cuda_core
max_seq_len: $MAX_MODEL_LEN
num_postprocess_workers: 4
EOF

# Only when attention DP is enabled, append the attention_dp_config section
# (enable_balance: true) to the same file.
if [[ $DP_ATTENTION == true ]]; then
cat << EOF >> $CONFIG_FILE
attention_dp_config:
enable_balance: true
EOF
fi

# Resolve the weights location the server will be launched with.
#   * MODEL_PATH unset/empty (stand-alone run): download into the default
#     Hugging Face cache and launch by model id.
#   * MODEL_PATH set but missing or empty: populate it with `hf download`,
#     which creates the directory itself and is idempotent.
#   * MODEL_PATH already populated: nothing to do.
if [[ -z "${MODEL_PATH:-}" ]]; then
  hf download "$MODEL"
  export MODEL_PATH="$MODEL"
elif [[ ! -d "$MODEL_PATH" ]] || [[ -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
  hf download "$MODEL" --local-dir "$MODEL_PATH"
fi

# Server output is captured here and handed to wait_for_server_ready below.
SERVER_LOG=/workspace/server.log
# Listening port; callers may override it through the environment.
PORT=${PORT:-8888}

echo "Generated config file contents:"
cat "$CONFIG_FILE"

# Start GPU monitoring (power, temperature, clocks every second)
start_gpu_monitor

# Trace every command from here on so the exact launch line lands in the log.
set -x

# Launch TRT-LLM server in the background. Every expansion is quoted so a
# value containing whitespace or glob characters cannot split into extra
# arguments or redirect the log to an unintended path.
mpirun -n 1 --oversubscribe --allow-run-as-root \
  trtllm-serve "$MODEL_PATH" --port="$PORT" \
  --trust_remote_code \
  --backend=pytorch \
  --max_batch_size "$CONC" \
  --tp_size="$TP" --ep_size="$EP_SIZE" \
  --config="$CONFIG_FILE" \
  > "$SERVER_LOG" 2>&1 &

SERVER_PID=$!

# Wait for server to be ready
wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Missing eval-only context setup

Medium Severity

The script writes max_seq_len from MAX_MODEL_LEN into the TRT-LLM config and starts the server without an EVAL_ONLY branch. Eval-only jobs (supported by the benchmark workflow) never call setup_eval_context, so the server can keep a throughput-tuned context cap and fail or truncate lm-eval runs.

Fix in Cursor Fix in Web

Reviewed by Cursor Bugbot for commit 9c5522e. Configure here.


# Throughput run against the local server: ten prompts per unit of
# concurrency, results written under /workspace/.
num_prompts=$(( $CONC * 10 ))
benchmark_args=(
  --model "$MODEL"
  --port "$PORT"
  --backend openai
  --input-len "$ISL"
  --output-len "$OSL"
  --random-range-ratio "$RANDOM_RANGE_RATIO"
  --num-prompts "$num_prompts"
  --max-concurrency "$CONC"
  --result-filename "$RESULT_FILENAME"
  --result-dir /workspace/
)
run_benchmark_serving "${benchmark_args[@]}"

# Accuracy evaluation is opt-in and runs after the throughput pass.
if [[ "${RUN_EVAL}" == "true" ]]; then
  run_eval --framework lm-eval --port "$PORT"
  append_lm_eval_summary
fi

# Tear down GPU monitoring, then turn command tracing back off.
stop_gpu_monitor
set +x
7 changes: 7 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3579,3 +3579,10 @@
- "MI355x DSR1-FP4: Include TP4 configurations for 8k1k"
- "Expand the TP sweep (included TP=4) for 8k/1k configuration for conc=4 to 64"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1692

# Changelog entry for the new MiniMax-M2.5 FP4 B300 TensorRT-LLM config key.
# TODO: replace the XXX placeholder in pr-link with this pull request's number
# before merging.
- config-keys:
- minimaxm2.5-fp4-b300-trt
description:
- "Add MiniMax-M2.5 FP4 (NVFP4) B300 TensorRT-LLM benchmark (model: nvidia/MiniMax-M2.5-NVFP4)"
- "Image: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc18"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXX
4 changes: 2 additions & 2 deletions runners/launch_b300-nv.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/bash

# System-specific configuration for B300 NV Slurm cluster
# System-specific configuration for B300 NV Slurm cluster (sa-shared)
SLURM_PARTITION="batch_1"
SLURM_ACCOUNT="benchmark"

Expand Down Expand Up @@ -339,7 +339,7 @@ else
export MODEL_PATH="${WRITABLE_MODELS_DIR%/}/${MODEL_BASENAME}"
fi

SQUASH_FILE="/data/home/sa-shared/gharunners/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
SQUASH_FILE="/data/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '')
# Prefer a framework-tagged script (e.g. dsv4_fp4_b300_sglang.sh) so models
# with multiple inference engines can coexist; fall back to the historical
Expand Down