216 changes: 216 additions & 0 deletions .github/configs/nvidia-master.yaml
@@ -6696,6 +6696,222 @@ dsr1-fp4-b200-dynamo-sglang-mtp:
            ep: 1
            dp-attn: false

kimik2.5-fp4-gb200-dynamo-trt:
  image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2
  model: nvidia/Kimi-K2.5-NVFP4
  model-prefix: kimik2.5
  runner: gb200
  precision: fp4
  framework: dynamo-trt
  multinode: true
  disagg: true
  seq-len-configs:
    - isl: 1024
      osl: 1024
      search-space:
        # Non-MTP configurations (default spec_decoding="none")
        - conc-list: [ 4, 192, 360, 668 ]
          prefill:
            num-worker: 1
            tp: 4
            ep: 4
            dp-attn: true
          additional-settings:
            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml
            - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml"
          decode:
            num-worker: 4
            tp: 8
            ep: 8
            dp-attn: false
        - conc-list: [ 5, 15, 30, 55 ]
          prefill:
            num-worker: 1
            tp: 4
            ep: 4
            dp-attn: true
          additional-settings:
            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml
            - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml"
          decode:
            num-worker: 5
            tp: 4
            ep: 4
            dp-attn: false
        - conc-list: [ 666 ]
          prefill:
            num-worker: 1
            tp: 4
            ep: 4
            dp-attn: true
          additional-settings:
            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep16_batch32_eplb0_mtp0.yaml
            - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep16_batch32_eplb0_mtp0.yaml"
          decode:
            num-worker: 1
            tp: 16
            ep: 16
            dp-attn: true
        - conc-list: [ 2253 ]
          prefill:
            num-worker: 1
            tp: 4
            ep: 4
            dp-attn: true
          additional-settings:
            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep32_batch64_eplb0_mtp0.yaml
            - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep32_batch64_eplb0_mtp0.yaml"
          decode:
            num-worker: 1
            tp: 32
            ep: 32
            dp-attn: true
        - conc-list: [ 4301, 6452 ]
          prefill:
            num-worker: 1
            tp: 4
            ep: 4
            dp-attn: true
          additional-settings:
            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep8_batch768_allconc_eplb0_mtp0.yaml
            - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep8_batch768_allconc_eplb0_mtp0.yaml"
          decode:
            num-worker: 1
            tp: 8
            ep: 8
            dp-attn: true
        - conc-list: [ 4301 ]
          prefill:
            num-worker: 2
            tp: 4
            ep: 4
            dp-attn: true
          additional-settings:
            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep16_batch256_eplb0_mtp0.yaml
            - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep16_batch256_eplb0_mtp0.yaml"
          decode:
            num-worker: 1
            tp: 16
            ep: 16
            dp-attn: true
        - conc-list: [ 4301 ]
          prefill:
            num-worker: 2
            tp: 4
            ep: 4
            dp-attn: true
          additional-settings:
            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep32_batch128_eplb0_mtp0.yaml
            - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep32_batch128_eplb0_mtp0.yaml"
          decode:
            num-worker: 1
            tp: 32
            ep: 32
            dp-attn: true

    - isl: 8192
      osl: 1024
      search-space:
        # Non-MTP configurations (default spec_decoding="none")
        - conc-list: [ 4 ]
          prefill:
            num-worker: 1
            tp: 4
            ep: 4
            dp-attn: true
          additional-settings:
            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep8_batch1_allconc_eplb0_mtp0.yaml
            - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep8_batch1_allconc_eplb0_mtp0.yaml"
          decode:
            num-worker: 4
            tp: 8
            ep: 8
            dp-attn: false
        - conc-list: [ 156 ]
          prefill:
            num-worker: 1
            tp: 4
            ep: 4
            dp-attn: true
          additional-settings:
            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep4_batch32_allconc_eplb0_mtp0.yaml
            - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep4_batch32_allconc_eplb0_mtp0.yaml"
          decode:
            num-worker: 4
            tp: 4
            ep: 4
            dp-attn: false
        - conc-list: [ 5, 15, 30, 60, 105 ]
          prefill:
            num-worker: 1
            tp: 4
            ep: 4
            dp-attn: true
          additional-settings:
            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch16_allconc_eplb0_mtp0.yaml
            - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch16_allconc_eplb0_mtp0.yaml"
          decode:
            num-worker: 5
            tp: 4
            ep: 4
            dp-attn: false
        - conc-list: [ 333 ]
          prefill:
            num-worker: 2
            tp: 4
            ep: 4
            dp-attn: true
          additional-settings:
            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx2dep4_gen1dep16_batch16_eplb0_mtp0.yaml
            - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx2dep4_gen1dep16_batch16_eplb0_mtp0.yaml"
          decode:
            num-worker: 1
            tp: 16
            ep: 16
            dp-attn: true
        - conc-list: [ 615 ]
          prefill:
            num-worker: 3
            tp: 4
            ep: 4
            dp-attn: true
          additional-settings:
            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx3dep4_gen1dep16_batch32_eplb0_mtp0.yaml
            - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx3dep4_gen1dep16_batch32_eplb0_mtp0.yaml"
          decode:
            num-worker: 1
            tp: 16
            ep: 16
            dp-attn: true
        - conc-list: [ 2151 ]
          prefill:
            num-worker: 5
            tp: 4
            ep: 4
            dp-attn: true
          additional-settings:
            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp0.yaml
            - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp0.yaml"
          decode:
            num-worker: 1
            tp: 8
            ep: 8
            dp-attn: true
        - conc-list: [ 2253 ]
          prefill:
            num-worker: 7
            tp: 4
            ep: 4
            dp-attn: true
          additional-settings:
            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx7dep4_gen1dep16_batch128_eplb0_mtp0.yaml
            - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx7dep4_gen1dep16_batch128_eplb0_mtp0.yaml"
          decode:
            num-worker: 1
            tp: 16
            ep: 16
            dp-attn: true

kimik2.5-fp4-gb200-dynamo-vllm:
  image: vllm/vllm-openai:v0.18.0-cu130
  model: nvidia/Kimi-K2.5-NVFP4
10 changes: 10 additions & 0 deletions perf-changelog.yaml
@@ -1,3 +1,13 @@
- config-keys:
    - kimik2.5-fp4-gb200-dynamo-trt
  description:
    - "Add Kimi K2.5 NVFP4 GB200 disaggregated TRT-LLM benchmarks via Dynamo (14 STP configs)"
    - "New framework: dynamo-trt (Dynamo frontend + TensorRT-LLM backend)"
    - "Container: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2"
    - "Recipes sourced from NVIDIA/srt-slurm branch sa-submission-q2-2026"
    - "Runner script updated to support kimik2.5 model prefix with dynamo-trt framework"
  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1026

- config-keys:
    - kimik2.5-int4-mi300x-vllm
  description:
10 changes: 9 additions & 1 deletion runners/launch_gb200-nv.sh
@@ -30,8 +30,12 @@ elif [[ $FRAMEWORK == "dynamo-trt" ]]; then
export MODEL_PATH="/mnt/numa1/groups/sa-shared/models/deepseek-r1-0528/"
export SERVED_MODEL_NAME="deepseek-r1-fp8"
export SRT_SLURM_MODEL_PREFIX="dsr1-fp8"
elif [[ $MODEL_PREFIX == "kimik2.5" && $PRECISION == "fp4" ]]; then

Collaborator: have we already done this for kimi dynamo vllm submission? @nlevin-ui

export MODEL_PATH="/mnt/lustre01/models/kimi-k2.5-nvfp4"
export SERVED_MODEL_NAME="kimi-k2.5-nvfp4"
export SRT_SLURM_MODEL_PREFIX="nvidia/Kimi-K2.5-NVFP4"
else
echo "Unsupported model prefix: $MODEL_PREFIX. Supported prefixes are: gptoss or dsr1"
echo "Unsupported model prefix: $MODEL_PREFIX. Supported prefixes are: gptoss, dsr1, or kimik2.5"
exit 1
fi
elif [[ $FRAMEWORK == "dynamo-vllm" ]]; then
@@ -124,6 +128,10 @@ if [[ $FRAMEWORK == "dynamo-vllm" ]]; then
    git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR"
    cd "$SRT_REPO_DIR"
    git checkout sa-submission-q2-2026
elif [[ $FRAMEWORK == "dynamo-trt" && $MODEL_PREFIX == "kimik2.5" ]]; then
    git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR"
    cd "$SRT_REPO_DIR"
    git checkout sa-submission-q2-2026
else
    git clone https://github.com/ishandhanani/srt-slurm.git "$SRT_REPO_DIR"
    cd "$SRT_REPO_DIR"
Comment on lines 128 to 137 (Contributor):

🟡 The new dynamo-trt+kimik2.5 git clone block (lines 131-134) is byte-for-byte identical to the existing dynamo-vllm block above it: both clone NVIDIA/srt-slurm and check out sa-submission-q2-2026. This is a maintenance hazard — a future change to the repo URL or branch name that is applied to only one block would cause silent behavioral divergence. The two conditions should be merged into a single branch: [[ $FRAMEWORK == "dynamo-vllm" || ($FRAMEWORK == "dynamo-trt" && $MODEL_PREFIX == "kimik2.5") ]].

Extended reasoning...

What the bug is and how it manifests

In runners/launch_gb200-nv.sh the script clones one of two different srt-slurm forks depending on the framework. The new PR adds a second elif arm for dynamo-trt+kimik2.5 that performs the exact same three operations as the existing dynamo-vllm arm: git clone https://github.com/NVIDIA/srt-slurm.git, cd "$SRT_REPO_DIR", and git checkout sa-submission-q2-2026. The two blocks are byte-for-byte identical.

The specific code path that triggers it

Lines 128-134 of the modified file:

if [[ $FRAMEWORK == "dynamo-vllm" ]]; then
    git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR"
    cd "$SRT_REPO_DIR"
    git checkout sa-submission-q2-2026
elif [[ $FRAMEWORK == "dynamo-trt" && $MODEL_PREFIX == "kimik2.5" ]]; then
    git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR"
    cd "$SRT_REPO_DIR"
    git checkout sa-submission-q2-2026

There is no technical reason for two separate branches — they do exactly the same thing.

Why existing code doesn't prevent it

There is no deduplication mechanism in a shell script; the interpreter executes whatever branch matches. Nothing prevents the two arms from silently diverging if only one is updated in a future commit.

What the impact would be

The code is functionally correct today. The risk is future maintenance: when (not if) the sa-submission-q2-2026 branch is retired or the NVIDIA repo URL changes, a developer updating the script may only find and update one of the two identical blocks. The other would silently continue using the stale URL or branch, causing hard-to-diagnose failures only for one of the two frameworks.

How to fix it

Merge the two arms into a single condition:

if [[ $FRAMEWORK == "dynamo-vllm" || ($FRAMEWORK == "dynamo-trt" && $MODEL_PREFIX == "kimik2.5") ]]; then
    git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR"
    cd "$SRT_REPO_DIR"
    git checkout sa-submission-q2-2026

Step-by-step proof of the duplication

  1. Locate line 128: if [[ $FRAMEWORK == "dynamo-vllm" ]] — body clones NVIDIA/srt-slurm and checks out sa-submission-q2-2026.
  2. Locate line 131: elif [[ $FRAMEWORK == "dynamo-trt" && $MODEL_PREFIX == "kimik2.5" ]] — body is character-for-character identical to step 1.
  3. Run diff on the two blocks: zero differences (a mechanical check is sketched below).
  4. Conclusion: the bodies are redundant and should share a single conditional.
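
Step 3 can be reproduced from the shell. A minimal sketch, assuming the dynamo-vllm arm body spans lines 129-131 and the dynamo-trt arm body lines 132-134 (the reviewer's two numberings differ slightly, so adjust the ranges to the checked-out file); the temp-file paths are illustrative:

# Extract each conditional arm's body and compare them; diff exits 0
# (so the echo fires) only when the bodies match byte-for-byte.
sed -n '129,131p' runners/launch_gb200-nv.sh > /tmp/vllm_arm
sed -n '132,134p' runners/launch_gb200-nv.sh > /tmp/trt_arm
diff /tmp/vllm_arm /tmp/trt_arm && echo "arm bodies are identical"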
