Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1801,7 +1801,7 @@ dsv4-fp4-b200-vllm-agentic:
- { tp: 8, ep: 8, dp-attn: true, offloading: cpu, conc-list: [64, 128, 256] }

dsv4-fp4-b200-trt:
image: ghcr.io/semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-9aa3715
image: ghcr.io/semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-c185066
model: deepseek-ai/DeepSeek-V4-Pro
model-prefix: dsv4
runner: b200-dsv4
Expand All @@ -1814,15 +1814,15 @@ dsv4-fp4-b200-trt:
osl: 1024
search-space:
- { tp: 8, conc-start: 1, conc-end: 32 }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 2048 }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 512 }
- isl: 8192
osl: 1024
search-space:
- { tp: 8, conc-start: 1, conc-end: 32 }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 1024 }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 512 }

dsv4-fp4-b200-trt-mtp:
image: ghcr.io/semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-9aa3715
image: ghcr.io/semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-c185066
model: deepseek-ai/DeepSeek-V4-Pro
model-prefix: dsv4
runner: b200-dsv4
Expand All @@ -1835,12 +1835,12 @@ dsv4-fp4-b200-trt-mtp:
osl: 1024
search-space:
- { tp: 8, conc-start: 1, conc-end: 32, spec-decoding: mtp }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 512, spec-decoding: mtp }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 256, spec-decoding: mtp }
- isl: 8192
osl: 1024
search-space:
- { tp: 8, conc-start: 1, conc-end: 32, spec-decoding: mtp }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 128, spec-decoding: mtp }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 256, spec-decoding: mtp }

# MTP variant of dsv4-fp4-b200-vllm. Mirrors the base search space and adds
# --speculative-config '{"method":"mtp","num_speculative_tokens":2}'.
Expand Down
30 changes: 23 additions & 7 deletions benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt.sh
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,12 @@ sanitize_slurm_mpi_env_for_trtllm
export NCCL_NVLS_ENABLE="${NCCL_NVLS_ENABLE:-0}"
echo "NCCL_NVLS_ENABLE: $NCCL_NVLS_ENABLE"

export TRTLLM_SERVER_DISABLE_GC="${TRTLLM_SERVER_DISABLE_GC:-1}"
export TRTLLM_WORKER_DISABLE_GC="${TRTLLM_WORKER_DISABLE_GC:-1}"
export NCCL_GRAPH_MIXING_SUPPORT="${NCCL_GRAPH_MIXING_SUPPORT:-0}"
export MIMALLOC_PURGE_DELAY="${MIMALLOC_PURGE_DELAY:-0}"
export PYTORCH_CUDA_ALLOC_CONF="${PYTORCH_CUDA_ALLOC_CONF:-expandable_segments:True}"

if [[ "$MODEL" != /* ]]; then
hf download "$MODEL"
fi
Expand All @@ -56,18 +62,27 @@ nvidia-smi
SERVER_LOG="$PWD/server.log"
EXTRA_CONFIG_FILE="dsv4-fp4-trt.yml"

MOE_BACKEND="TRTLLM"
# MoE backend: TRTLLM at low/mid concurrency; switch to MEGAMOE_DEEPGEMM at the
# top concurrency for short ISL (1k).
if [[ "$ISL" -le 1024 && "$CONC" -ge 2048 ]]; then
MOE_BACKEND="${MOE_BACKEND:-MEGAMOE_DEEPGEMM}"
else
MOE_BACKEND="${MOE_BACKEND:-TRTLLM}"
fi
MAX_BATCH_SIZE=$(( CONC > 16 ? CONC : 16 ))
CUDA_GRAPH_MAX_BATCH_SIZE="$MAX_BATCH_SIZE"
KV_CACHE_FREE_MEM_FRACTION="${KV_CACHE_FREE_MEM_FRACTION:-0.50}"
if [[ "$DP_ATTENTION" == "true" ]]; then
KV_CACHE_FREE_MEM_FRACTION="${KV_CACHE_FREE_MEM_FRACTION:-0.7}"
else
KV_CACHE_FREE_MEM_FRACTION="${KV_CACHE_FREE_MEM_FRACTION:-0.9}"
fi

ATTENTION_DP_CONFIG=""
if [[ "$DP_ATTENTION" == "true" ]]; then
ATTENTION_DP_CONFIG="
attention_dp_config:
batching_wait_iters: 0
enable_balance: true
timeout_iters: 60"
batching_wait_iters: 30
enable_balance: true"
fi

cat > "$EXTRA_CONFIG_FILE" << EOF
Expand All @@ -81,17 +96,18 @@ kv_cache_config:
dtype: fp8
free_gpu_memory_fraction: $KV_CACHE_FREE_MEM_FRACTION
enable_block_reuse: false
stream_interval: 10
stream_interval: 100
num_postprocess_workers: 4
moe_config:
backend: $MOE_BACKEND
use_low_precision_moe_combine: true
EOF

echo "Generated config file contents:"
cat "$EXTRA_CONFIG_FILE"

MAX_MODEL_LEN=$(( MAX_MODEL_LEN > 8192 ? MAX_MODEL_LEN : 8192 ))
MAX_NUM_TOKENS=$(( ISL + OSL + 256 ))
MAX_NUM_TOKENS=$(( ISL + 256 ))
MAX_NUM_TOKENS=$(( MAX_NUM_TOKENS > 8192 ? MAX_NUM_TOKENS : 8192 ))

if [ "${EVAL_ONLY}" = "true" ]; then
Expand Down
39 changes: 31 additions & 8 deletions benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt_mtp.sh
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,12 @@ sanitize_slurm_mpi_env_for_trtllm
export NCCL_NVLS_ENABLE="${NCCL_NVLS_ENABLE:-0}"
echo "NCCL_NVLS_ENABLE: $NCCL_NVLS_ENABLE"

export TRTLLM_SERVER_DISABLE_GC="${TRTLLM_SERVER_DISABLE_GC:-1}"
export TRTLLM_WORKER_DISABLE_GC="${TRTLLM_WORKER_DISABLE_GC:-1}"
export NCCL_GRAPH_MIXING_SUPPORT="${NCCL_GRAPH_MIXING_SUPPORT:-0}"
export MIMALLOC_PURGE_DELAY="${MIMALLOC_PURGE_DELAY:-0}"
export PYTORCH_CUDA_ALLOC_CONF="${PYTORCH_CUDA_ALLOC_CONF:-expandable_segments:True}"

if [[ "$MODEL" != /* ]]; then
hf download "$MODEL"
fi
Expand All @@ -55,19 +61,35 @@ nvidia-smi
SERVER_LOG="$PWD/server.log"
EXTRA_CONFIG_FILE="dsv4-fp4-trt-mtp.yml"

MOE_BACKEND="TRTLLM"
MTP="${TRTLLM_DSV4_MTP_NUM_NEXTN_LAYERS:-2}"
# MoE backend: TRTLLM at low/mid concurrency; switch to MEGAMOE_DEEPGEMM at high
# concurrency for short ISL (1k).
if [[ "$ISL" -le 1024 && "$CONC" -ge 512 ]]; then
MOE_BACKEND="${MOE_BACKEND:-MEGAMOE_DEEPGEMM}"
else
MOE_BACKEND="${MOE_BACKEND:-TRTLLM}"

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

MOE threshold exceeds sweep maximum

Medium Severity

The MTP recipe switches MOE_BACKEND to MEGAMOE_DEEPGEMM only when ISL ≤ 1024 and CONC ≥ 512, but this PR caps DP-attn conc-end at 256 for both 1k and 8k ISL in nvidia-master.yaml, so automated sweeps never satisfy the condition and always use TRTLLM, diverging from the B300 frontier pattern the block was copied from.

Fix in Cursor Fix in Web

Reviewed by Cursor Bugbot for commit e707884. Configure here.

fi
# MTP draft length: 3 at low/mid concurrency; steps down to 2 at high concurrency
# for long ISL (8k).
if [[ "$ISL" -ge 4096 && "$CONC" -ge 128 ]]; then
MTP="${TRTLLM_DSV4_MTP_NUM_NEXTN_LAYERS:-2}"
else
MTP="${TRTLLM_DSV4_MTP_NUM_NEXTN_LAYERS:-3}"
fi
MAX_BATCH_SIZE=$(( CONC > 16 ? CONC : 16 ))
CUDA_GRAPH_MAX_BATCH_SIZE="$MAX_BATCH_SIZE"
KV_CACHE_FREE_MEM_FRACTION="${KV_CACHE_FREE_MEM_FRACTION:-0.50}"
if [[ "$DP_ATTENTION" == "true" ]]; then
KV_CACHE_FREE_MEM_FRACTION="${KV_CACHE_FREE_MEM_FRACTION:-0.6}"
else
KV_CACHE_FREE_MEM_FRACTION="${KV_CACHE_FREE_MEM_FRACTION:-0.9}"
fi

ATTENTION_DP_CONFIG=""
if [[ "$DP_ATTENTION" == "true" ]]; then
ATTENTION_DP_CONFIG="
attention_dp_config:
batching_wait_iters: 0
batching_wait_iters: 30
enable_balance: true
timeout_iters: 60"
enable_lm_head_tp_in_adp: true"
fi

cat > "$EXTRA_CONFIG_FILE" << EOF
Expand All @@ -81,20 +103,21 @@ kv_cache_config:
dtype: fp8
free_gpu_memory_fraction: $KV_CACHE_FREE_MEM_FRACTION
enable_block_reuse: false
stream_interval: 10
stream_interval: 100
num_postprocess_workers: 4
moe_config:
backend: $MOE_BACKEND
use_low_precision_moe_combine: true
speculative_config:
decoding_type: MTP
num_nextn_predict_layers: $MTP
max_draft_len: $MTP
EOF

echo "Generated config file contents:"
cat "$EXTRA_CONFIG_FILE"

MAX_MODEL_LEN=$(( MAX_MODEL_LEN > 8192 ? MAX_MODEL_LEN : 8192 ))
MAX_NUM_TOKENS=$(( ISL + OSL + (MTP + 1) * MAX_BATCH_SIZE + 256 ))
MAX_NUM_TOKENS=$(( ISL + (MTP + 1) * MAX_BATCH_SIZE + 256 ))
MAX_NUM_TOKENS=$(( MAX_NUM_TOKENS > 8192 ? MAX_NUM_TOKENS : 8192 ))

if [ "${EVAL_ONLY}" = "true" ]; then
Expand Down
10 changes: 10 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3543,6 +3543,16 @@
- "A/B sweep (28 single-node points, 1k1k + 8k1k, TP 1/2/4) vs the Python-frontend baseline (run 26696260751): throughput Pareto-neutral (peak tok/s/GPU within <1.5%, frontiers coincident) and TPOT flat (+-0.5%); TTFT improves ~8% at 1k1k and ~22% at 8k1k (every point), the expected signature of lower frontend CPU latency before first token, scaling with input length"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1634

- config-keys:
- dsv4-fp4-b200-trt
- dsv4-fp4-b200-trt-mtp
description:
- "Update the B200 TensorRT-LLM DeepSeek-V4-Pro image to ghcr.io/semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-c185066"
- "Sync the dsv4-fp4-b200-trt and dsv4-fp4-b200-trt-mtp recipes with the B200 aggregated frontier config (worker GC off, NCCL graph mixing off, mimalloc/PyTorch alloc tweaks, higher KV cache fractions by DP path, stream_interval 100, use_low_precision_moe_combine, DP batching_wait_iters 30, max_num_tokens drops the OSL term)"
- "MTP recipe uses max_draft_len with a variable default draft length, enable_lm_head_tp_in_adp on the DP-attn path, and removes timeout_iters from the DP config"
- "Retune DP-attn conc-end: dsv4-fp4-b200-trt to 512 (1k ISL: 2048->512; 8k ISL: 1024->512) and dsv4-fp4-b200-trt-mtp to 256 (1k ISL: 512->256; 8k ISL: 128->256)"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1699

- config-keys:
- dsv4-fp4-mi355x-atom-disagg
description:
Expand Down