diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 957dd0934..25777616d 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -3049,7 +3049,7 @@ dsv4-fp4-b300-vllm-agentic: - { tp: 8, ep: 8, dp-attn: true, offloading: cpu, conc-list: [128, 256, 512] } dsv4-fp4-b300-trt: - image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-9aa3715 + image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-c185066 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b300 @@ -3063,7 +3063,7 @@ dsv4-fp4-b300-trt: search-space: - { tp: 4, conc-start: 1, conc-end: 64 } - { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 256 } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 512, conc-end: 2048 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 512, conc-end: 1024 } - isl: 8192 osl: 1024 search-space: @@ -3072,7 +3072,7 @@ dsv4-fp4-b300-trt: - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 1024 } dsv4-fp4-b300-trt-mtp: - image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-9aa3715 + image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-c185066 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b300 @@ -3086,7 +3086,7 @@ dsv4-fp4-b300-trt-mtp: search-space: - { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp } - { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 256, spec-decoding: mtp } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 512, conc-end: 2048, spec-decoding: mtp } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 512, conc-end: 1024, spec-decoding: mtp } - isl: 8192 osl: 1024 search-space: diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_trt.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_trt.sh index b0150e10d..bcd1fbf6a 100644 --- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_trt.sh +++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_trt.sh @@ -59,23 +59,44 @@ 
sanitize_slurm_mpi_env_for_trtllm export NCCL_NVLS_ENABLE="${NCCL_NVLS_ENABLE:-0}" echo "NCCL_NVLS_ENABLE: $NCCL_NVLS_ENABLE" +export TRTLLM_SERVER_DISABLE_GC="${TRTLLM_SERVER_DISABLE_GC:-1}" +export TRTLLM_WORKER_DISABLE_GC="${TRTLLM_WORKER_DISABLE_GC:-1}" +export NCCL_GRAPH_MIXING_SUPPORT="${NCCL_GRAPH_MIXING_SUPPORT:-0}" +export MIMALLOC_PURGE_DELAY="${MIMALLOC_PURGE_DELAY:-0}" +export PYTORCH_CUDA_ALLOC_CONF="${PYTORCH_CUDA_ALLOC_CONF:-expandable_segments:True}" + nvidia-smi SERVER_LOG="$PWD/server.log" EXTRA_CONFIG_FILE="dsv4-fp4-trt.yml" -MOE_BACKEND="TRTLLM" +# MoE backend: TRTLLM at low/mid concurrency; switch to MEGAMOE_DEEPGEMM at the +# top concurrency for short ISL (1k). +if [[ "$ISL" -le 1024 && "$CONC" -ge 2048 ]]; then + MOE_BACKEND="${MOE_BACKEND:-MEGAMOE_DEEPGEMM}" +else + MOE_BACKEND="${MOE_BACKEND:-TRTLLM}" +fi MAX_BATCH_SIZE=$(( CONC > 16 ? CONC : 16 )) -CUDA_GRAPH_MAX_BATCH_SIZE="$MAX_BATCH_SIZE" -KV_CACHE_FREE_MEM_FRACTION="${KV_CACHE_FREE_MEM_FRACTION:-0.50}" +# Cap CUDA-graph capture at batch 1024. TRTLLM_MLA_EXTRA_OVERLAP hands MLA +# prologue tensors across streams without record_stream(), so graph warmup at +# decode batch >1024 (repros at 1088, e.g. tp8/ep8 dp-attn conc-2048 on B300) +# hits a use-after-free -> CUDA_ERROR_ILLEGAL_ADDRESS. Fixed upstream in +# NVIDIA/TensorRT-LLM#15265; cap until that fix ships in the image. Runtime +# --max_batch_size stays = CONC, so batches >1024 just run eager. +CUDA_GRAPH_MAX_BATCH_SIZE=$(( MAX_BATCH_SIZE < 1024 ? 
MAX_BATCH_SIZE : 1024 )) +if [[ "$DP_ATTENTION" == "true" ]]; then + KV_CACHE_FREE_MEM_FRACTION="${KV_CACHE_FREE_MEM_FRACTION:-0.7}" +else + KV_CACHE_FREE_MEM_FRACTION="${KV_CACHE_FREE_MEM_FRACTION:-0.9}" +fi ATTENTION_DP_CONFIG="" if [[ "$DP_ATTENTION" == "true" ]]; then ATTENTION_DP_CONFIG=" attention_dp_config: - batching_wait_iters: 0 - enable_balance: true - timeout_iters: 60" + batching_wait_iters: 30 + enable_balance: true" fi cat > "$EXTRA_CONFIG_FILE" << EOF @@ -89,17 +110,18 @@ kv_cache_config: dtype: fp8 free_gpu_memory_fraction: $KV_CACHE_FREE_MEM_FRACTION enable_block_reuse: false -stream_interval: 10 +stream_interval: 100 num_postprocess_workers: 4 moe_config: backend: $MOE_BACKEND + use_low_precision_moe_combine: true EOF echo "Generated config file contents:" cat "$EXTRA_CONFIG_FILE" MAX_MODEL_LEN=$(( MAX_MODEL_LEN > 8192 ? MAX_MODEL_LEN : 8192 )) -MAX_NUM_TOKENS=$(( ISL + OSL + 256 )) +MAX_NUM_TOKENS=$(( ISL + 256 )) MAX_NUM_TOKENS=$(( MAX_NUM_TOKENS > 8192 ? MAX_NUM_TOKENS : 8192 )) if [ "${EVAL_ONLY}" = "true" ]; then diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_trt_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_trt_mtp.sh index 507b96e34..bb0362c25 100644 --- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_trt_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_trt_mtp.sh @@ -58,24 +58,52 @@ sanitize_slurm_mpi_env_for_trtllm export NCCL_NVLS_ENABLE="${NCCL_NVLS_ENABLE:-0}" echo "NCCL_NVLS_ENABLE: $NCCL_NVLS_ENABLE" +export TRTLLM_SERVER_DISABLE_GC="${TRTLLM_SERVER_DISABLE_GC:-1}" +export TRTLLM_WORKER_DISABLE_GC="${TRTLLM_WORKER_DISABLE_GC:-1}" +export NCCL_GRAPH_MIXING_SUPPORT="${NCCL_GRAPH_MIXING_SUPPORT:-0}" +export MIMALLOC_PURGE_DELAY="${MIMALLOC_PURGE_DELAY:-0}" +export PYTORCH_CUDA_ALLOC_CONF="${PYTORCH_CUDA_ALLOC_CONF:-expandable_segments:True}" + nvidia-smi SERVER_LOG="$PWD/server.log" EXTRA_CONFIG_FILE="dsv4-fp4-trt-mtp.yml" -MOE_BACKEND="TRTLLM" 
-MTP="${TRTLLM_DSV4_MTP_NUM_NEXTN_LAYERS:-2}" +# MoE backend: TRTLLM at low/mid concurrency; switch to MEGAMOE_DEEPGEMM at high +# concurrency for short ISL (1k). +if [[ "$ISL" -le 1024 && "$CONC" -ge 512 ]]; then + MOE_BACKEND="${MOE_BACKEND:-MEGAMOE_DEEPGEMM}" +else + MOE_BACKEND="${MOE_BACKEND:-TRTLLM}" +fi +# MTP draft length: 3 at low/mid concurrency; steps down to 2 at high concurrency +# for long ISL (8k). +if [[ "$ISL" -ge 4096 && "$CONC" -ge 128 ]]; then + MTP="${TRTLLM_DSV4_MTP_NUM_NEXTN_LAYERS:-2}" +else + MTP="${TRTLLM_DSV4_MTP_NUM_NEXTN_LAYERS:-3}" +fi MAX_BATCH_SIZE=$(( CONC > 16 ? CONC : 16 )) -CUDA_GRAPH_MAX_BATCH_SIZE="$MAX_BATCH_SIZE" -KV_CACHE_FREE_MEM_FRACTION="${KV_CACHE_FREE_MEM_FRACTION:-0.50}" +# Cap CUDA-graph capture at batch 1024. TRTLLM_MLA_EXTRA_OVERLAP hands MLA +# prologue tensors across streams without record_stream(), so graph warmup at +# decode batch >1024 (repros at 1088, e.g. tp8/ep8 dp-attn conc-2048 on B300) +# hits a use-after-free -> CUDA_ERROR_ILLEGAL_ADDRESS. Fixed upstream in +# NVIDIA/TensorRT-LLM#15265; cap until that fix ships in the image. Runtime +# --max_batch_size stays = CONC, so batches >1024 just run eager. +CUDA_GRAPH_MAX_BATCH_SIZE=$(( MAX_BATCH_SIZE < 1024 ? 
MAX_BATCH_SIZE : 1024 )) +if [[ "$DP_ATTENTION" == "true" ]]; then + KV_CACHE_FREE_MEM_FRACTION="${KV_CACHE_FREE_MEM_FRACTION:-0.6}" +else + KV_CACHE_FREE_MEM_FRACTION="${KV_CACHE_FREE_MEM_FRACTION:-0.9}" +fi ATTENTION_DP_CONFIG="" if [[ "$DP_ATTENTION" == "true" ]]; then ATTENTION_DP_CONFIG=" attention_dp_config: - batching_wait_iters: 0 + batching_wait_iters: 30 enable_balance: true - timeout_iters: 60" +enable_lm_head_tp_in_adp: true" fi cat > "$EXTRA_CONFIG_FILE" << EOF @@ -89,20 +117,21 @@ kv_cache_config: dtype: fp8 free_gpu_memory_fraction: $KV_CACHE_FREE_MEM_FRACTION enable_block_reuse: false -stream_interval: 10 +stream_interval: 100 num_postprocess_workers: 4 moe_config: backend: $MOE_BACKEND + use_low_precision_moe_combine: true speculative_config: decoding_type: MTP - num_nextn_predict_layers: $MTP + max_draft_len: $MTP EOF echo "Generated config file contents:" cat "$EXTRA_CONFIG_FILE" MAX_MODEL_LEN=$(( MAX_MODEL_LEN > 8192 ? MAX_MODEL_LEN : 8192 )) -MAX_NUM_TOKENS=$(( ISL + OSL + (MTP + 1) * MAX_BATCH_SIZE + 256 )) +MAX_NUM_TOKENS=$(( ISL + (MTP + 1) * MAX_BATCH_SIZE + 256 )) MAX_NUM_TOKENS=$(( MAX_NUM_TOKENS > 8192 ? 
MAX_NUM_TOKENS : 8192 )) if [ "${EVAL_ONLY}" = "true" ]; then diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 72764bcc8..e54dc7b6e 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3543,6 +3543,17 @@ - "A/B sweep (28 single-node points, 1k1k + 8k1k, TP 1/2/4) vs the Python-frontend baseline (run 26696260751): throughput Pareto-neutral (peak tok/s/GPU within <1.5%, frontiers coincident) and TPOT flat (+-0.5%); TTFT improves ~8% at 1k1k and ~22% at 8k1k (every point), the expected signature of lower frontend CPU latency before first token, scaling with input length" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1634 +- config-keys: + - dsv4-fp4-b300-trt + - dsv4-fp4-b300-trt-mtp + description: + - "Update the B300 TensorRT-LLM DeepSeek-V4-Pro image to ghcr.io/semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-c185066" + - "B300 analog of PR #1699 (B200): sync the dsv4-fp4-b300-trt and dsv4-fp4-b300-trt-mtp recipes with the agg frontier config (server/worker GC off, NCCL graph mixing off, mimalloc/PyTorch alloc tweaks, higher KV cache fractions by DP path, stream_interval 100, use_low_precision_moe_combine, DP batching_wait_iters 30 with timeout_iters removed, max_num_tokens drops the OSL term)" + - "MTP recipe uses max_draft_len with a variable default draft length and sets enable_lm_head_tp_in_adp on the DP-attn path" + - "Cap cuda_graph_config.max_batch_size at 1024 on both recipes: TRTLLM_MLA_EXTRA_OVERLAP hands MLA prologue tensors across streams without record_stream(), so CUDA-graph warmup at decode batch >1024 (repros at 1088, e.g. tp8/ep8 dp-attn conc-2048 on B300) use-after-frees into CUDA_ERROR_ILLEGAL_ADDRESS; workaround until NVIDIA/TensorRT-LLM#15265 ships in the image. 
Runtime --max_batch_size stays = CONC, so batches >1024 run eager" + - "B300-specific bits preserved (MODEL_PATH download block, TRTLLM_MHC_ENABLE_FUSED_HC=1, trtllm-serve MODEL_PATH); drop the 1k1k conc-2048 point on the tp8/ep8 DP-attn row (both recipes), the batch regime that triggers the MLA-overlap crash above; rest of the search space unchanged" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1703 + - config-keys: - dsr1-fp4-b200-dynamo-sglang-mtp description: