From 31b4fbe4ff6f60642106f3fab63df1e050487712 Mon Sep 17 00:00:00 2001
From: seungrokj <seungrok.jung@amd.com>
Date: Fri, 12 Jun 2026 21:38:15 +0900
Subject: [PATCH 1/5] [AMD] dsv4-fp4-mi355x-atom: enable DPA TBO at high
 concurrency, update image to atom0.1.4

- Enable --enable-tbo for ISL=1024/OSL=1024 at CONC>=1024 and ISL=8192/OSL=1024 at CONC>=256
- Update image to atom0.1.4_20260612
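- Update ISL=8192 search-space: TP8 from conc=4 to 64, DPA from conc=128 to 1024
  (previously TP8 conc=1-64 and DPA conc=64-512)
- Remove the outdated Day-0 CONC=1 comment block above dsv4-fp4-mi355x-atom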

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .github/configs/amd-master.yaml               | 19 +++++++++----------
 .../fixed_seq_len/dsv4_fp4_mi355x_atom.sh     | 11 +++++++++--
 2 files changed, 18 insertions(+), 12 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 70a79a273..77e4f0040 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -2257,15 +2257,8 @@ dsv4-fp4-mi355x-vllm-mtp:
       search-space:
       - { tp: 8, conc-start: 4, conc-end: 512, spec-decoding: mtp }
 
-# Day-0 single-sequence marker for DeepSeek-V4 on ATOM (ROCm/ATOM#650).
-# PR1 of the ATOM DSv4 series still uses torch sparse-attention fallbacks
-# that OOM once warmup/prefill batches multiple requests; keep CONC=1 until
-# the AITER sparse-attention kernel / multi-request path lands upstream.
-# --enforce-eager and ATOM_USE_TRITON_MOE=1 are required on gfx950. Image is
-# the standard atom0.1.2.post MI355X base (matching qwen3.5-fp8-mi355x-atom);
-# the DSv4 PR is overlaid at runtime by dsv4_fp4_mi355x_atom.sh at a pinned SHA.
 dsv4-fp4-mi355x-atom:
-  image: rocm/atom:rocm7.2.4_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.3
+  image: rocm/atom:rocm7.2.4_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.4_20260612
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: mi355x
@@ -2277,13 +2270,19 @@ dsv4-fp4-mi355x-atom:
     - isl: 1024
       osl: 1024
       search-space:
+        # conc4-64, TP8
+        # conc128-512, DPA
+        # conc1024, DPA TBO
       - { tp: 8, ep: 1, conc-start: 1, conc-end: 64 }
       - { tp: 8, ep: 1, dp-attn: true, conc-start: 64, conc-end: 1024 }
     - isl: 8192
       osl: 1024
       search-space:
-      - { tp: 8, ep: 1, conc-start: 1, conc-end: 64 }
-      - { tp: 8, ep: 1, dp-attn: true, conc-start: 64, conc-end: 512 }
+        # conc4-64, TP8
+        # conc128, DPA
+        # conc256-1024, DPA TBO
+      - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 }
+      - { tp: 8, ep: 1, dp-attn: true, conc-start: 128, conc-end: 1024 }
 
 dsv4-fp4-mi355x-atom-mtp:
   image: rocm/atom:rocm7.2.4_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.3
diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh
index e485dc9a6..4f4545824 100644
--- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh
@@ -25,8 +25,15 @@ PARALLEL_ARGS=(-tp "$TP") #TP
 if [ "$DP_ATTENTION" = "true" ]; then
     if [ "$EP_SIZE" -gt 1 ]; then #DP+EP
         PARALLEL_ARGS=(-tp "$TP" --enable-expert-parallel --enable-dp-attention )
-    else #DP+TP
-        PARALLEL_ARGS=(-tp "$TP" --enable-dp-attention )
+    else #DPA+TP
+        # DPA+TP; add TBO (Two Batch Overlap) only for the high-concurrency cases below
+        if [ "$ISL" -eq 1024 ] && [ "$OSL" -eq 1024 ] && [ "$CONC" -ge 1024 ]; then
+            PARALLEL_ARGS=(-tp "$TP" --enable-dp-attention --enable-tbo)
+        elif [ "$ISL" -eq 8192 ] && [ "$OSL" -eq 1024 ] && [ "$CONC" -ge 256 ]; then
+            PARALLEL_ARGS=(-tp "$TP" --enable-dp-attention --enable-tbo)
+        else
+            PARALLEL_ARGS=(-tp "$TP" --enable-dp-attention )
+        fi
     fi
 fi 
 

From c566e28e05a85be0c06bf531c6d6d92548f3aebf Mon Sep 17 00:00:00 2001
From: seungrokj <seungrok.jung@amd.com>
Date: Fri, 12 Jun 2026 21:40:22 +0900
Subject: [PATCH 2/5] [AMD] perf-changelog: dsv4-fp4-mi355x-atom DPA TBO +
 image atom0.1.4

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 perf-changelog.yaml | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index c0642188b..e68d5d3e0 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3600,3 +3600,11 @@
     - "MI355x DSR1-FP4: Include TP4 configurations for 8k1k"
     - "Expand the TP sweep (included TP=4) for 8k/1k configuration for conc=4 to 64"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1692
+
+- config-keys:
+    - dsv4-fp4-mi355x-atom
+  description:
+    - "Update image to rocm/atom:rocm7.2.4_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.4_20260612"
+    - "Enable --enable-tbo (Token-Bucket Overlap) on top of DPA+TP8 at high concurrency: ISL=1024/OSL=1024 at CONC>=1024, ISL=8192/OSL=1024 at CONC>=256"
+    - "Update ISL=8192 search-space: TP8-only from conc=4-64, DPA from conc=128-1024 (previously conc=1-64 and DPA conc=64-512)"
+  pr-link: 

From 7e1aa060dbbc69f072ac51a5c43e475b4014da01 Mon Sep 17 00:00:00 2001
From: seungrokj <seungrok.jung@amd.com>
Date: Fri, 12 Jun 2026 21:41:35 +0900
Subject: [PATCH 3/5] [AMD] perf-changelog: add PR link #1717

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 perf-changelog.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index e68d5d3e0..f236a6d60 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3605,6 +3605,6 @@
     - dsv4-fp4-mi355x-atom
   description:
     - "Update image to rocm/atom:rocm7.2.4_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.4_20260612"
-    - "Enable --enable-tbo (Token-Bucket Overlap) on top of DPA+TP8 at high concurrency: ISL=1024/OSL=1024 at CONC>=1024, ISL=8192/OSL=1024 at CONC>=256"
+    - "Enable --enable-tbo (Two Batch Overlap) on top of DPA+TP8 at high concurrency: ISL=1024/OSL=1024 at CONC>=1024, ISL=8192/OSL=1024 at CONC>=256"
     - "Update ISL=8192 search-space: TP8-only from conc=4-64, DPA from conc=128-1024 (previously conc=1-64 and DPA conc=64-512)"
-  pr-link: 
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1717

From 65e0fa328cd93161c5e736cc9f2fe7f8f22aed16 Mon Sep 17 00:00:00 2001
From: seungrokj <seungrok.jung@amd.com>
Date: Fri, 12 Jun 2026 21:59:22 +0900
Subject: [PATCH 4/5] [AMD] dsv4_fp4_mi355x_atom.sh: disable prefix caching

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh
index 4f4545824..369b72281 100644
--- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh
@@ -53,6 +53,7 @@ python3 -m atom.entrypoints.openai_server \
     --kv_cache_dtype fp8 \
     --trust-remote-code \
     --gpu-memory-utilization 0.85 \
+    --no-enable_prefix_caching \
     > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!

From 3f3560b7ef5b39461065bad07da780e063438313 Mon Sep 17 00:00:00 2001
From: seungrokj <seungrok.jung@amd.com>
Date: Fri, 12 Jun 2026 22:17:00 +0900
Subject: [PATCH 5/5] [AMD] dsv4-fp4-mi355x-atom: add max-model-len, eval
 context, extend conc range

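- Pass --max-model-len to the server via SERVE_MAX_MODEL_LEN (MAX_MODEL_LEN for benchmark runs)
- Add EVAL_ONLY path: serve with the context length returned by
  compute_eval_context_length and export it as EVAL_MAX_MODEL_LEN
- Extend DPA conc-end to 8192 (isl=1024) and 4096 (isl=8192) in amd-master.yaml
- Remove the chunked-prefill TODO comment from dsv4_fp4_mi355x_atom.sh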

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .github/configs/amd-master.yaml                      |  8 ++++----
 .../fixed_seq_len/dsv4_fp4_mi355x_atom.sh            | 12 ++++++++++--
 2 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 77e4f0040..977f0ef2a 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -2272,17 +2272,17 @@ dsv4-fp4-mi355x-atom:
       search-space:
         # conc4-64, TP8
         # conc128-512, DPA
-        # conc1024, DPA TBO
+        # conc1024-8192, DPA TBO
       - { tp: 8, ep: 1, conc-start: 1, conc-end: 64 }
-      - { tp: 8, ep: 1, dp-attn: true, conc-start: 64, conc-end: 1024 }
+      - { tp: 8, ep: 1, dp-attn: true, conc-start: 64, conc-end: 8192 }
     - isl: 8192
       osl: 1024
       search-space:
         # conc4-64, TP8
         # conc128, DPA
-        # conc256-1024, DPA TBO
+        # conc256-4096, DPA TBO
       - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 }
-      - { tp: 8, ep: 1, dp-attn: true, conc-start: 128, conc-end: 1024 }
+      - { tp: 8, ep: 1, dp-attn: true, conc-start: 128, conc-end: 4096 }
 
 dsv4-fp4-mi355x-atom-mtp:
   image: rocm/atom:rocm7.2.4_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.3
diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh
index 369b72281..cfd4354b8 100644
--- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh
@@ -37,6 +37,15 @@ if [ "$DP_ATTENTION" = "true" ]; then
     fi
 fi 
 
+BENCHMARK_MAX_MODEL_LEN="$MAX_MODEL_LEN"
+
+if [ "${EVAL_ONLY}" = "true" ]; then
+    EVAL_MAX_MODEL_LEN=$(compute_eval_context_length "$MODEL" "$BENCHMARK_MAX_MODEL_LEN")
+    export EVAL_MAX_MODEL_LEN
+    SERVE_MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
+else
+    SERVE_MAX_MODEL_LEN="$BENCHMARK_MAX_MODEL_LEN"
+fi
 # Start GPU monitoring (power, temperature, clocks every second)
 start_gpu_monitor
 
@@ -44,8 +53,6 @@ set -x
 export ATOM_DISABLE_MMAP=true
 export AITER_BF16_FP8_MOE_BOUND=0
 export ATOM_MOE_GU_ITLV=1
-# TODO: add --no-enable_chunked_prefill, when dsv4 prefix caching is supported 
-#https://github.com/ROCm/ATOM/commit/7df93a181da4d3c3250c2441c7d5e2745a03d0cd#diff-61b1ba0b8b74523530d2d5cdc739d4f3a23a43bedf69015a5235844d46e9373bL1127
 python3 -m atom.entrypoints.openai_server \
     --model $MODEL \
     --server-port $PORT \
@@ -54,6 +61,7 @@ python3 -m atom.entrypoints.openai_server \
     --trust-remote-code \
     --gpu-memory-utilization 0.85 \
     --no-enable_prefix_caching \
+    --max-model-len "$SERVE_MAX_MODEL_LEN" \
     > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!