From 105dbb812fd621afb69a7c40bf5a0a1e918a19d5 Mon Sep 17 00:00:00 2001 From: Yeswanth Koti Date: Mon, 13 Apr 2026 17:25:21 +0000 Subject: [PATCH 1/4] Add Kimi K2.5 NVFP4 GB200 disaggregated TRT-LLM benchmarks via Dynamo Add kimik2.5-fp4-gb200-dynamo-trt configuration with 29 search-space entries covering ISL 1024/OSL 1024 (8 MTP + 7 STP) and ISL 8192/OSL 1024 (7 MTP + 7 STP) disaggregated configs. Update launch_gb200-nv.sh to support kimik2.5 model prefix with dynamo-trt framework and clone srt-slurm from NVIDIA/srt-slurm@sa-submission-q2-2026. --- .github/configs/nvidia-master.yaml | 445 +++++++++++++++++++++++++++++ runners/launch_gb200-nv.sh | 10 +- 2 files changed, 454 insertions(+), 1 deletion(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 15dc69195..e871154f4 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -6696,6 +6696,451 @@ dsr1-fp4-b200-dynamo-sglang-mtp: ep: 1 dp-attn: false +kimik2.5-fp4-gb200-dynamo-trt: + image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2 + model: nvidia/Kimi-K2.5-NVFP4 + model-prefix: kimik2.5 + runner: gb200 + precision: fp4 + framework: dynamo-trt + multinode: true + disagg: true + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + # MTP configurations (spec_decoding="mtp") + - spec-decoding: "mtp" + conc-list: [ 8, 48, 92, 192, 336 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen4tep8_batch64_allconc_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen4tep8_batch64_allconc_eplb0_mtp3.yaml" + decode: + num-worker: 4 + tp: 8 + ep: 8 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [ 10, 15 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen5tep4_batch2_allconc_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen5tep4_batch2_allconc_eplb0_mtp3.yaml" + decode: + num-worker: 5 + tp: 4 + ep: 4 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [ 666 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep32_batch16_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep32_batch16_eplb0_mtp3.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [ 666 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep16_batch32_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep16_batch32_eplb0_mtp3.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [ 4301 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep8_batch512_eplb0_mtp1.yaml + - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep8_batch512_eplb0_mtp1.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [ 2253 ] + prefill: + num-worker: 2 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx2dep4_gen1dep32_batch64_eplb0_mtp1.yaml + - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx2dep4_gen1dep32_batch64_eplb0_mtp1.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [ 2253 ] + prefill: + num-worker: 2 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx2dep4_gen1dep16_batch128_eplb0_mtp1.yaml + - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx2dep4_gen1dep16_batch128_eplb0_mtp1.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [ 6759 ] + prefill: + num-worker: 2 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx2dep4_gen3dep8_batch256_eplb0_mtp1.yaml + - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx2dep4_gen3dep8_batch256_eplb0_mtp1.yaml" + decode: + num-worker: 3 + tp: 8 + ep: 8 + dp-attn: true + + # Non-MTP configurations (default spec_decoding="none") + - conc-list: [ 4, 192, 360, 668 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml" + decode: + num-worker: 4 + tp: 8 + ep: 8 + dp-attn: false + - conc-list: [ 5, 15, 30, 55 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml" + decode: + num-worker: 5 + tp: 4 + ep: 4 + dp-attn: false + - conc-list: [ 666 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep16_batch32_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep16_batch32_eplb0_mtp0.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + - conc-list: [ 2253 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep32_batch64_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep32_batch64_eplb0_mtp0.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + - conc-list: [ 4301, 6452 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep8_batch768_allconc_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep8_batch768_allconc_eplb0_mtp0.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + - conc-list: [ 4301 ] + prefill: + num-worker: 2 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep16_batch256_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep16_batch256_eplb0_mtp0.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + - conc-list: [ 4301 ] + prefill: + num-worker: 2 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep32_batch128_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep32_batch128_eplb0_mtp0.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + + - isl: 8192 + osl: 1024 + search-space: + # MTP configurations (spec_decoding="mtp") + - spec-decoding: "mtp" + conc-list: [ 8 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen4tep8_batch1_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen4tep8_batch1_eplb0_mtp3.yaml" + decode: + num-worker: 4 + tp: 8 + ep: 8 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [ 90 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen2tep8_batch32_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen2tep8_batch32_eplb0_mtp3.yaml" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [ 10, 15, 60 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp3.yaml" + decode: + num-worker: 5 + tp: 4 + ep: 4 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [ 180 ] + prefill: + num-worker: 2 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/MTP/ctx2dep4_gen1dep16_batch8_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/MTP/ctx2dep4_gen1dep16_batch8_eplb0_mtp3.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [ 666 ] + prefill: + num-worker: 5 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/MTP/ctx5dep4_gen1dep32_batch16_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/MTP/ctx5dep4_gen1dep32_batch16_eplb0_mtp3.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [ 1229, 2253 ] + prefill: + num-worker: 5 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/MTP/ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp1.yaml + - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/MTP/ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp1.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [ 1229 ] + prefill: + num-worker: 8 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/MTP/ctx8dep4_gen1dep32_batch32_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/MTP/ctx8dep4_gen1dep32_batch32_eplb0_mtp3.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + + # Non-MTP configurations (default spec_decoding="none") + - conc-list: [ 4 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep8_batch1_allconc_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep8_batch1_allconc_eplb0_mtp0.yaml" + decode: + num-worker: 4 + tp: 8 + ep: 8 + dp-attn: false + - conc-list: [ 156 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep4_batch32_allconc_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep4_batch32_allconc_eplb0_mtp0.yaml" + decode: + num-worker: 4 + tp: 4 + ep: 4 + dp-attn: false + - conc-list: [ 5, 15, 30, 60, 105 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch16_allconc_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch16_allconc_eplb0_mtp0.yaml" + decode: + num-worker: 5 + tp: 4 + ep: 4 + dp-attn: false + - conc-list: [ 333 ] + prefill: + num-worker: 2 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx2dep4_gen1dep16_batch16_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx2dep4_gen1dep16_batch16_eplb0_mtp0.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + - conc-list: [ 615 ] + prefill: + num-worker: 3 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx3dep4_gen1dep16_batch32_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx3dep4_gen1dep16_batch32_eplb0_mtp0.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + - conc-list: [ 2151 ] + prefill: + num-worker: 5 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp0.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + - conc-list: [ 2253 ] + prefill: + num-worker: 7 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx7dep4_gen1dep16_batch128_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx7dep4_gen1dep16_batch128_eplb0_mtp0.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + kimik2.5-fp4-gb200-dynamo-vllm: image: vllm/vllm-openai:v0.18.0-cu130 model: nvidia/Kimi-K2.5-NVFP4 diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index e0e55481f..da881f704 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -30,8 +30,12 @@ elif [[ $FRAMEWORK == "dynamo-trt" ]]; then export MODEL_PATH="/mnt/numa1/groups/sa-shared/models/deepseek-r1-0528/" export SERVED_MODEL_NAME="deepseek-r1-fp8" export SRT_SLURM_MODEL_PREFIX="dsr1-fp8" + elif [[ $MODEL_PREFIX == "kimik2.5" && $PRECISION == "fp4" ]]; then + export MODEL_PATH="/mnt/lustre01/models/kimi-k2.5-nvfp4" + export SERVED_MODEL_NAME="kimi-k2.5-nvfp4" + export SRT_SLURM_MODEL_PREFIX="kimi-k2.5-nvfp4" else - echo "Unsupported model prefix: $MODEL_PREFIX. Supported prefixes are: gptoss or dsr1" + echo "Unsupported model prefix: $MODEL_PREFIX. Supported prefixes are: gptoss, dsr1, or kimik2.5" exit 1 fi elif [[ $FRAMEWORK == "dynamo-vllm" ]]; then @@ -124,6 +128,10 @@ if [[ $FRAMEWORK == "dynamo-vllm" ]]; then git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" git checkout sa-submission-q2-2026 +elif [[ $FRAMEWORK == "dynamo-trt" && $MODEL_PREFIX == "kimik2.5" ]]; then + git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" + cd "$SRT_REPO_DIR" + git checkout sa-submission-q2-2026 else git clone https://github.com/ishandhanani/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" From fd80f08628e5e9f4d76a5cffc9d1f26aa9a6df39 Mon Sep 17 00:00:00 2001 From: Yeswanth Koti Date: Tue, 14 Apr 2026 19:25:21 +0000 Subject: [PATCH 2/4] Remove MTP configurations from Kimi K2.5 GB200 dynamo-trt benchmarks Remove speculative decoding (MTP) search-space entries from kimik2.5-fp4-gb200-dynamo-trt for both ISL 1024/OSL 1024 and ISL 8192/OSL 1024. Retain all 14 STP (non-MTP) configurations. --- .github/configs/nvidia-master.yaml | 229 ----------------------------- 1 file changed, 229 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index e871154f4..ba3885e7b 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -6709,128 +6709,6 @@ kimik2.5-fp4-gb200-dynamo-trt: - isl: 1024 osl: 1024 search-space: - # MTP configurations (spec_decoding="mtp") - - spec-decoding: "mtp" - conc-list: [ 8, 48, 92, 192, 336 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen4tep8_batch64_allconc_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen4tep8_batch64_allconc_eplb0_mtp3.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - - spec-decoding: "mtp" - conc-list: [ 10, 15 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen5tep4_batch2_allconc_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen5tep4_batch2_allconc_eplb0_mtp3.yaml" - decode: - num-worker: 5 - tp: 4 - ep: 4 - dp-attn: false - - spec-decoding: "mtp" - conc-list: [ 666 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep32_batch16_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep32_batch16_eplb0_mtp3.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [ 666 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep16_batch32_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep16_batch32_eplb0_mtp3.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [ 4301 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep8_batch512_eplb0_mtp1.yaml - - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx1dep4_gen1dep8_batch512_eplb0_mtp1.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [ 2253 ] - prefill: - num-worker: 2 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx2dep4_gen1dep32_batch64_eplb0_mtp1.yaml - - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx2dep4_gen1dep32_batch64_eplb0_mtp1.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [ 2253 ] - prefill: - num-worker: 2 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx2dep4_gen1dep16_batch128_eplb0_mtp1.yaml - - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx2dep4_gen1dep16_batch128_eplb0_mtp1.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [ 6759 ] - prefill: - num-worker: 2 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx2dep4_gen3dep8_batch256_eplb0_mtp1.yaml - - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/MTP/ctx2dep4_gen3dep8_batch256_eplb0_mtp1.yaml" - decode: - num-worker: 3 - tp: 8 - ep: 8 - dp-attn: true - # Non-MTP configurations (default spec_decoding="none") - conc-list: [ 4, 192, 360, 668 ] prefill: @@ -6934,113 +6812,6 @@ kimik2.5-fp4-gb200-dynamo-trt: - isl: 8192 osl: 1024 search-space: - # MTP configurations (spec_decoding="mtp") - - spec-decoding: "mtp" - conc-list: [ 8 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen4tep8_batch1_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen4tep8_batch1_eplb0_mtp3.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - - spec-decoding: "mtp" - conc-list: [ 90 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen2tep8_batch32_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen2tep8_batch32_eplb0_mtp3.yaml" - decode: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: false - - spec-decoding: "mtp" - conc-list: [ 10, 15, 60 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/MTP/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp3.yaml" - decode: - num-worker: 5 - tp: 4 - ep: 4 - dp-attn: false - - spec-decoding: "mtp" - conc-list: [ 180 ] - prefill: - num-worker: 2 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/MTP/ctx2dep4_gen1dep16_batch8_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/MTP/ctx2dep4_gen1dep16_batch8_eplb0_mtp3.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [ 666 ] - prefill: - num-worker: 5 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/MTP/ctx5dep4_gen1dep32_batch16_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/MTP/ctx5dep4_gen1dep32_batch16_eplb0_mtp3.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [ 1229, 2253 ] - prefill: - num-worker: 5 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/MTP/ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp1.yaml - - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/MTP/ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp1.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [ 1229 ] - prefill: - num-worker: 8 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/MTP/ctx8dep4_gen1dep32_batch32_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/MTP/ctx8dep4_gen1dep32_batch32_eplb0_mtp3.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - # Non-MTP configurations (default spec_decoding="none") - conc-list: [ 4 ] prefill: From 072a372a6c854d73c88e35245bbd57ae9f041ca6 Mon Sep 17 00:00:00 2001 From: "claude[bot]" <41898282+claude[bot]@users.noreply.github.com> Date: Tue, 14 Apr 2026 20:21:43 +0000 Subject: [PATCH 3/4] Add perf changelog entry for Kimi K2.5 NVFP4 GB200 dynamo-trt benchmarks Co-authored-by: Cameron Quilici --- perf-changelog.yaml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 746d0645d..9c49ba920 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1,3 +1,13 @@ +- config-keys: + - kimik2.5-fp4-gb200-dynamo-trt + description: + - "Add Kimi K2.5 NVFP4 GB200 disaggregated TRT-LLM benchmarks via Dynamo (14 STP configs)" + - "New framework: dynamo-trt (Dynamo frontend + TensorRT-LLM backend)" + - "Container: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2" + - "Recipes sourced from NVIDIA/srt-slurm branch sa-submission-q2-2026" + - "Runner script updated to support kimik2.5 model prefix with dynamo-trt framework" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1026 + - config-keys: - kimik2.5-int4-mi300x-vllm description: From 49f12bc9cd118ae7897d40cb7a9cbdbd69b09562 Mon Sep 17 00:00:00 2001 From: Yeswanth Koti Date: Tue, 14 Apr 2026 21:02:07 +0000 Subject: [PATCH 4/4] Fix SRT_SLURM_MODEL_PREFIX for Kimi K2.5 dynamo-trt to match recipe model.path Made-with: Cursor --- runners/launch_gb200-nv.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index da881f704..153bcd0f6 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -33,7 +33,7 @@ elif [[ $FRAMEWORK == "dynamo-trt" ]]; then elif [[ $MODEL_PREFIX == "kimik2.5" && $PRECISION == "fp4" ]]; then export MODEL_PATH="/mnt/lustre01/models/kimi-k2.5-nvfp4" export SERVED_MODEL_NAME="kimi-k2.5-nvfp4" - export SRT_SLURM_MODEL_PREFIX="kimi-k2.5-nvfp4" + export SRT_SLURM_MODEL_PREFIX="nvidia/Kimi-K2.5-NVFP4" else echo "Unsupported model prefix: $MODEL_PREFIX. Supported prefixes are: gptoss, dsr1, or kimik2.5" exit 1