Skip to content
8 changes: 4 additions & 4 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3049,7 +3049,7 @@ dsv4-fp4-b300-vllm-agentic:
- { tp: 8, ep: 8, dp-attn: true, offloading: cpu, conc-list: [128, 256, 512] }

dsv4-fp4-b300-trt:
image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-9aa3715
image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-c185066
model: deepseek-ai/DeepSeek-V4-Pro
model-prefix: dsv4
runner: b300
Expand All @@ -3063,7 +3063,7 @@ dsv4-fp4-b300-trt:
search-space:
- { tp: 4, conc-start: 1, conc-end: 64 }
- { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 256 }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 512, conc-end: 2048 }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 512, conc-end: 1024 }
- isl: 8192
osl: 1024
search-space:
Expand All @@ -3072,7 +3072,7 @@ dsv4-fp4-b300-trt:
- { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 1024 }

dsv4-fp4-b300-trt-mtp:
image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-9aa3715
image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-c185066
model: deepseek-ai/DeepSeek-V4-Pro
model-prefix: dsv4
runner: b300
Expand All @@ -3086,7 +3086,7 @@ dsv4-fp4-b300-trt-mtp:
search-space:
- { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp }
- { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 256, spec-decoding: mtp }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 512, conc-end: 2048, spec-decoding: mtp }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 512, conc-end: 1024, spec-decoding: mtp }
- isl: 8192
osl: 1024
search-space:
Expand Down
38 changes: 30 additions & 8 deletions benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_trt.sh
Original file line number Diff line number Diff line change
Expand Up @@ -59,23 +59,44 @@ sanitize_slurm_mpi_env_for_trtllm
export NCCL_NVLS_ENABLE="${NCCL_NVLS_ENABLE:-0}"
echo "NCCL_NVLS_ENABLE: $NCCL_NVLS_ENABLE"

# Runtime tuning knobs for the TRT-LLM server/worker processes: GC-disable
# flags, NCCL graph mixing off, and mimalloc / PyTorch allocator settings.
# Each one is only a default -- a non-empty value already present in the
# caller's environment is kept as-is.
: "${TRTLLM_SERVER_DISABLE_GC:=1}"
: "${TRTLLM_WORKER_DISABLE_GC:=1}"
: "${NCCL_GRAPH_MIXING_SUPPORT:=0}"
: "${MIMALLOC_PURGE_DELAY:=0}"
: "${PYTORCH_CUDA_ALLOC_CONF:=expandable_segments:True}"
export TRTLLM_SERVER_DISABLE_GC TRTLLM_WORKER_DISABLE_GC \
  NCCL_GRAPH_MIXING_SUPPORT MIMALLOC_PURGE_DELAY PYTORCH_CUDA_ALLOC_CONF

nvidia-smi

SERVER_LOG="$PWD/server.log"
EXTRA_CONFIG_FILE="dsv4-fp4-trt.yml"

MOE_BACKEND="TRTLLM"
# Pick the MoE backend unless the caller already exported MOE_BACKEND:
#   ISL <= 1024 and CONC >= 2048  -> MEGAMOE_DEEPGEMM
#   anything else                 -> TRTLLM
# NOTE(review): the scheduled 1k sweep for this recipe now stops at
# conc-end 1024, so the MEGAMOE_DEEPGEMM branch is only reachable from a
# manual run with CONC >= 2048 (the MTP sibling switches at 512) -- confirm
# this threshold is still the intended one.
if [[ "$ISL" -le 1024 && "$CONC" -ge 2048 ]]; then
  : "${MOE_BACKEND:=MEGAMOE_DEEPGEMM}"
else
  : "${MOE_BACKEND:=TRTLLM}"
fi

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

MoE threshold never matches sweep

Medium Severity

The non-MTP recipe only selects MEGAMOE_DEEPGEMM when CONC is at least 2048, but this PR caps the 1k tp8/ep8 DP-attn sweep at 1024. Scheduled runs never hit that branch, so high-concurrency points keep TRTLLM despite the comment and MTP sibling using MEGAMOE_DEEPGEMM from 512 upward.

Additional Locations (1)
Fix in Cursor Fix in Web

Reviewed by Cursor Bugbot for commit 9f02c5d. Configure here.

MAX_BATCH_SIZE=$(( CONC > 16 ? CONC : 16 ))
CUDA_GRAPH_MAX_BATCH_SIZE="$MAX_BATCH_SIZE"
KV_CACHE_FREE_MEM_FRACTION="${KV_CACHE_FREE_MEM_FRACTION:-0.50}"
# CUDA-graph capture is limited to batch size 1024.
# Why: with TRTLLM_MLA_EXTRA_OVERLAP the MLA prologue tensors cross streams
# without record_stream(), and graph warmup at decode batch >1024 (seen at
# 1088, e.g. tp8/ep8 dp-attn conc-2048 on B300) runs into a use-after-free
# that surfaces as CUDA_ERROR_ILLEGAL_ADDRESS. NVIDIA/TensorRT-LLM#15265 fixes
# it upstream; keep this cap until the image contains that fix. The runtime
# --max_batch_size is not capped here, so batches above 1024 run eager.
CUDA_GRAPH_MAX_BATCH_SIZE=$(( MAX_BATCH_SIZE ))
if (( CUDA_GRAPH_MAX_BATCH_SIZE > 1024 )); then
  CUDA_GRAPH_MAX_BATCH_SIZE=1024
fi

# Default KV-cache free-memory fraction depends on the attention path
# (0.7 with DP attention, 0.9 without); a caller-provided value wins.
case "$DP_ATTENTION" in
  true) KV_CACHE_FREE_MEM_FRACTION="${KV_CACHE_FREE_MEM_FRACTION:-0.7}" ;;
  *)    KV_CACHE_FREE_MEM_FRACTION="${KV_CACHE_FREE_MEM_FRACTION:-0.9}" ;;
esac

ATTENTION_DP_CONFIG=""
if [[ "$DP_ATTENTION" == "true" ]]; then
ATTENTION_DP_CONFIG="
attention_dp_config:
batching_wait_iters: 0
enable_balance: true
timeout_iters: 60"
batching_wait_iters: 30
enable_balance: true"
fi

cat > "$EXTRA_CONFIG_FILE" << EOF
Expand All @@ -89,17 +110,18 @@ kv_cache_config:
dtype: fp8
free_gpu_memory_fraction: $KV_CACHE_FREE_MEM_FRACTION
enable_block_reuse: false
stream_interval: 10
stream_interval: 100
num_postprocess_workers: 4
moe_config:
backend: $MOE_BACKEND
use_low_precision_moe_combine: true
EOF

echo "Generated config file contents:"
cat "$EXTRA_CONFIG_FILE"

MAX_MODEL_LEN=$(( MAX_MODEL_LEN > 8192 ? MAX_MODEL_LEN : 8192 ))
MAX_NUM_TOKENS=$(( ISL + OSL + 256 ))
MAX_NUM_TOKENS=$(( ISL + 256 ))
MAX_NUM_TOKENS=$(( MAX_NUM_TOKENS > 8192 ? MAX_NUM_TOKENS : 8192 ))

if [ "${EVAL_ONLY}" = "true" ]; then
Expand Down
47 changes: 38 additions & 9 deletions benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_trt_mtp.sh
Original file line number Diff line number Diff line change
Expand Up @@ -58,24 +58,52 @@ sanitize_slurm_mpi_env_for_trtllm
export NCCL_NVLS_ENABLE="${NCCL_NVLS_ENABLE:-0}"
echo "NCCL_NVLS_ENABLE: $NCCL_NVLS_ENABLE"

# Runtime tuning knobs for the TRT-LLM server/worker processes: GC-disable
# flags, NCCL graph mixing off, and mimalloc / PyTorch allocator settings.
# Each one is only a default -- a non-empty value already present in the
# caller's environment is kept as-is.
: "${TRTLLM_SERVER_DISABLE_GC:=1}"
: "${TRTLLM_WORKER_DISABLE_GC:=1}"
: "${NCCL_GRAPH_MIXING_SUPPORT:=0}"
: "${MIMALLOC_PURGE_DELAY:=0}"
: "${PYTORCH_CUDA_ALLOC_CONF:=expandable_segments:True}"
export TRTLLM_SERVER_DISABLE_GC TRTLLM_WORKER_DISABLE_GC \
  NCCL_GRAPH_MIXING_SUPPORT MIMALLOC_PURGE_DELAY PYTORCH_CUDA_ALLOC_CONF

nvidia-smi

SERVER_LOG="$PWD/server.log"
EXTRA_CONFIG_FILE="dsv4-fp4-trt-mtp.yml"

MOE_BACKEND="TRTLLM"
MTP="${TRTLLM_DSV4_MTP_NUM_NEXTN_LAYERS:-2}"
# Pick the MoE backend unless the caller already exported MOE_BACKEND:
#   ISL <= 1024 and CONC >= 512  -> MEGAMOE_DEEPGEMM
#   anything else                -> TRTLLM
if [[ "$ISL" -le 1024 && "$CONC" -ge 512 ]]; then
  : "${MOE_BACKEND:=MEGAMOE_DEEPGEMM}"
else
  : "${MOE_BACKEND:=TRTLLM}"
fi

# MTP draft length: default 3, reduced to 2 when ISL >= 4096 and CONC >= 128.
# A non-empty TRTLLM_DSV4_MTP_NUM_NEXTN_LAYERS overrides either default.
MTP=3
if [[ "$ISL" -ge 4096 && "$CONC" -ge 128 ]]; then
  MTP=2
fi
MTP="${TRTLLM_DSV4_MTP_NUM_NEXTN_LAYERS:-$MTP}"
MAX_BATCH_SIZE=$(( CONC > 16 ? CONC : 16 ))
CUDA_GRAPH_MAX_BATCH_SIZE="$MAX_BATCH_SIZE"
KV_CACHE_FREE_MEM_FRACTION="${KV_CACHE_FREE_MEM_FRACTION:-0.50}"
# CUDA-graph capture is limited to batch size 1024.
# Why: with TRTLLM_MLA_EXTRA_OVERLAP the MLA prologue tensors cross streams
# without record_stream(), and graph warmup at decode batch >1024 (seen at
# 1088, e.g. tp8/ep8 dp-attn conc-2048 on B300) runs into a use-after-free
# that surfaces as CUDA_ERROR_ILLEGAL_ADDRESS. NVIDIA/TensorRT-LLM#15265 fixes
# it upstream; keep this cap until the image contains that fix. The runtime
# --max_batch_size is not capped here, so batches above 1024 run eager.
CUDA_GRAPH_MAX_BATCH_SIZE=$(( MAX_BATCH_SIZE ))
if (( CUDA_GRAPH_MAX_BATCH_SIZE > 1024 )); then
  CUDA_GRAPH_MAX_BATCH_SIZE=1024
fi

# Default KV-cache free-memory fraction depends on the attention path
# (0.6 with DP attention, 0.9 without); a caller-provided value wins.
case "$DP_ATTENTION" in
  true) KV_CACHE_FREE_MEM_FRACTION="${KV_CACHE_FREE_MEM_FRACTION:-0.6}" ;;
  *)    KV_CACHE_FREE_MEM_FRACTION="${KV_CACHE_FREE_MEM_FRACTION:-0.9}" ;;
esac

ATTENTION_DP_CONFIG=""
if [[ "$DP_ATTENTION" == "true" ]]; then
ATTENTION_DP_CONFIG="
attention_dp_config:
batching_wait_iters: 0
batching_wait_iters: 30
enable_balance: true
timeout_iters: 60"
enable_lm_head_tp_in_adp: true"
fi

cat > "$EXTRA_CONFIG_FILE" << EOF
Expand All @@ -89,20 +117,21 @@ kv_cache_config:
dtype: fp8
free_gpu_memory_fraction: $KV_CACHE_FREE_MEM_FRACTION
enable_block_reuse: false
stream_interval: 10
stream_interval: 100
num_postprocess_workers: 4
moe_config:
backend: $MOE_BACKEND
use_low_precision_moe_combine: true
speculative_config:
decoding_type: MTP
num_nextn_predict_layers: $MTP
max_draft_len: $MTP
EOF

echo "Generated config file contents:"
cat "$EXTRA_CONFIG_FILE"

MAX_MODEL_LEN=$(( MAX_MODEL_LEN > 8192 ? MAX_MODEL_LEN : 8192 ))
MAX_NUM_TOKENS=$(( ISL + OSL + (MTP + 1) * MAX_BATCH_SIZE + 256 ))
MAX_NUM_TOKENS=$(( ISL + (MTP + 1) * MAX_BATCH_SIZE + 256 ))
MAX_NUM_TOKENS=$(( MAX_NUM_TOKENS > 8192 ? MAX_NUM_TOKENS : 8192 ))

if [ "${EVAL_ONLY}" = "true" ]; then
Expand Down
11 changes: 11 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3543,6 +3543,17 @@
- "A/B sweep (28 single-node points, 1k1k + 8k1k, TP 1/2/4) vs the Python-frontend baseline (run 26696260751): throughput Pareto-neutral (peak tok/s/GPU within <1.5%, frontiers coincident) and TPOT flat (+-0.5%); TTFT improves ~8% at 1k1k and ~22% at 8k1k (every point), the expected signature of lower frontend CPU latency before first token, scaling with input length"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1634

- config-keys:
- dsv4-fp4-b300-trt
- dsv4-fp4-b300-trt-mtp
description:
- "Update the B300 TensorRT-LLM DeepSeek-V4-Pro image to ghcr.io/semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-c185066"
- "B300 analog of PR #1699 (B200): sync the dsv4-fp4-b300-trt and dsv4-fp4-b300-trt-mtp recipes with the agg frontier config (worker GC off, NCCL graph mixing off, mimalloc/PyTorch alloc tweaks, higher KV cache fractions by DP path, stream_interval 100, use_low_precision_moe_combine, DP batching_wait_iters 30, max_num_tokens drops the OSL term)"
- "MTP recipe uses max_draft_len with a variable default draft length, enable_lm_head_tp_in_adp on the DP-attn path, and removes timeout_iters from the DP config"
- "Cap cuda_graph_config.max_batch_size at 1024 on both recipes: TRTLLM_MLA_EXTRA_OVERLAP hands MLA prologue tensors across streams without record_stream(), so CUDA-graph warmup at decode batch >1024 (repros at 1088, e.g. tp8/ep8 dp-attn conc-2048 on B300) use-after-frees into CUDA_ERROR_ILLEGAL_ADDRESS; workaround until NVIDIA/TensorRT-LLM#15265 ships in the image. Runtime --max_batch_size stays = CONC, so batches >1024 run eager"
- "B300-specific bits preserved (MODEL_PATH download block, TRTLLM_MHC_ENABLE_FUSED_HC=1, trtllm-serve MODEL_PATH); drop the 1k1k conc-2048 point on the tp8/ep8 DP-attn row (both recipes), the batch regime that triggers the MLA-overlap crash above; rest of the search space unchanged"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1703

- config-keys:
- dsr1-fp4-b200-dynamo-sglang-mtp
description:
Expand Down