Changes from all commits (69 commits)
All 69 commits are by cquil11 (2026).

e332f90  add agentic trace replay benchmark infrastructure  (Apr 1)
28991eb  remove deprecated GpuTransferCollector from metrics collector  (Apr 1)
695ec2e  modularize metrics collector with backend auto-detection  (Apr 1)
6a41d49  remove unused Protocol import  (Apr 1)
c137677  add LMCache agentic trace benchmark for H100  (Apr 1)
ee76767  add H100 LMCache trace sweep config  (Apr 1)
839ba0f  fix LMCache benchmark: use fixed-schedule replay, remove ignore_eos  (Apr 1)
fc8e3cf  remove --fixed-schedule: use concurrency mode per Samuel's recommenda…  (Apr 1)
6bbbfa9  update yaml  (Apr 2)
a2e4fe6  fix H100 runner: add SCRIPT_SUFFIX support  (Apr 2)
fee0278  fix: mkdir RESULT_DIR before trace conversion  (Apr 2)
769532c  add H200 LMCache trace benchmark and config  (Apr 2)
02876af  update yaml  (Apr 2)
2134fd8  fix H200-nb runner: add SCRIPT_SUFFIX support  (Apr 2)
ab2812a  fix all H200 runners: add SCRIPT_SUFFIX support  (Apr 2)
5aa993f  fix all runners: add SCRIPT_SUFFIX support  (Apr 2)
d5dd151  reduce multiturn artifact size: upload only files needed for post-pro…  (Apr 2)
bd4ec30  add exclusive  (Apr 2)
a12cc9d  add exclusive  (Apr 2)
af49d11  add exclusive  (Apr 2)
48ef44d  use aiperf summary CSV instead of per-record JSONL for post-processing  (Apr 2)
4f106b8  debug  (Apr 2)
cfb25fb  fix LMCache traces: convert system role to developer for vLLM v0.18+  (Apr 2)
ede9bde  revert system->developer role conversion in LMCache traces  (Apr 2)
a7ac440  fix MetricsCollector missing gpu_transfer_collector attribute  (Apr 2)
db87b95  fix LMCache traces: strip null fields to pass vLLM Pydantic validation  (Apr 2)
07ce85d  use hf download for LMCache traces instead of datasets.load_dataset  (Apr 2)
195ca66  add B200 FP4 multiturn benchmark script using aiperf  (Apr 2)
09e6ec1  add entry for b200 ds  (Apr 2)
951326a  add expert parallel support to B200 FP4 aiperf script  (Apr 2)
0100fa1  skip LMCache trace entries with empty messages  (Apr 2)
110dfa4  fix: prioritize aiperf summary CSV over malformed client CSV  (Apr 2)
c64e644  fix aiperf CSV parser: handle multi-section format with different col…  (Apr 2)
43abe6b  update aiperf submodule: fix raw_messages input token counting  (Apr 3)
4754c40  add --use-server-token-count to lmcache aiperf benchmarks  (Apr 3)
648d522  update kv-cache-tester submodule: fix reasoning token counting  (Apr 3)
63c26ff  add p50 (median) clean and overlay pareto frontier plots  (Apr 3)
aa8276c  expand lmcache concurrency sweep to find GPU cache overflow points  (Apr 3)
4781db8  prune dominated concurrency points from lmcache sweep  (Apr 3)
3cb9983  Revert "prune dominated concurrency points from lmcache sweep"  (Apr 3)
1691863  remove excluded nodes from H100 DGXC SLURM config  (Apr 6)
a9069da  point kv-cache-tester submodule to upstream (callanjfox) at latest HEAD  (Apr 8)
4395b53  Merge branch 'main' into experimental/agentic-benchmark  (Apr 8)
e66e75a  Update kv-cache-tester submodule to neon-trace-support branch  (Apr 13)
037dd10  Switch multiturn benchmark to Neon traces and add LFS support  (Apr 13)
7b51764  Remove LFS, update submodule to non-LFS trace commit  (Apr 13)
8f4c069  no exclusive  (Apr 14)
aebb6fe  no exclusive dgxc  (Apr 14)
ea1013d  increase aiperf configure timeout for H100 lmcache benchmark  (Apr 14)
674793d  update kv-cache-tester submodule: add cumulative assessment metrics  (Apr 14)
80e324c  switch to simple KV offloading, remove vLLM patches  (Apr 14)
11245a9  switch trace replay scripts to simple KV offloading, remove patches  (Apr 14)
38fc1fb  add explicit --timing-strategy think-only to trace replay scripts  (Apr 14)
b54ce74  update kv-cache-tester: add debug log for streaming delta attrs  (Apr 14)
b4bc408  add --ignore-eos option for trace replay benchmarks  (Apr 14)
f35c1fe  pin checkout to trigger commit SHA, not branch HEAD  (Apr 14)
cbfffd0  add FP4 MI355X trace replay benchmark script  (Apr 15)
ca79d3a  fix MI355X FP4 trace replay: disable AITER MoE to avoid GPU memory fault  (Apr 15)
e86b5d5  mi355x dsr1 config  (Apr 15)
7749e30  drop SimpleCPUOffloadConnector on ROCm, update H200 trace dir  (Apr 15)
78d7388  update kv-cache-tester: merge traces-ratelimiting branch  (Apr 15)
56bf004  fix CW salloc: specify GPU type in GRES request  (Apr 15)
6b91c3f  add trace_dir workflow input for trace directory override  (Apr 15)
6411d18  remove --exclusive from CW salloc, not supported on dynamic nodes  (Apr 15)
b07b4eb  update kv-cache-tester: traces only from traces-ratelimiting  (Apr 15)
00f8118  update kv-cache-tester: remove debug logging  (Apr 15)
6abde55  fix MI355X offloading: use native connector without HMA  (Apr 15)
55b53fe  mi355x dsr1 config  (Apr 15)
b5e14dc  remove second exclusive from b200 dgxc srun  (Apr 15)
58 changes: 58 additions & 0 deletions .github/configs/multiturn-agentic-trace.yaml
@@ -0,0 +1,58 @@
h200-fp8-llama70b:
  tp2:
    users: [2, 4, 6, 8, 10, 12, 16, 20, 24, 32]
    offload: ["on", "off"]
  tp4:
    users: [2, 4, 6, 8, 16, 24, 32, 40, 48, 56]
    offload: ["on", "off"]
  tp8:
    users: [2, 4, 6, 8, 16, 32, 48, 64, 80, 128, 256]
    offload: ["on", "off"]

mi355x-fp8-llama70b:
  tp2:
    users: [1, 2, 4, 8, 16, 24, 32, 40, 48, 56]
    offload: ["on", "off"]
  tp4:
    users: [1, 2, 4, 8, 16, 32, 48, 64, 80, 96, 112, 256]
    offload: ["on", "off"]
  tp8:
    users: [1, 2, 4, 8, 16, 32, 64, 96, 128, 160, 256, 512]
    offload: ["on", "off"]

h200-fp8-llama70b-lmcache:
  tp2:
    users: [1, 2, 4, 6, 8, 10, 12, 16, 20, 32, 48, 64, 80, 96, 128]
    offload: ["on", "off"]
  tp4:
    users: [1, 2, 4, 8, 12, 16, 20, 24, 32, 40, 64, 80, 96, 128, 160, 192]
    offload: ["on", "off"]
  tp8:
    users: [1, 2, 4, 8, 16, 24, 32, 48, 64, 128, 256, 384, 512, 768, 1024]
    offload: ["on", "off"]

h100-fp8-llama70b-lmcache:
  tp2:
    users: [1, 2, 4, 6, 8, 10, 12, 16, 20, 32, 64]
    offload: ["on", "off"]
  tp4:
    users: [1, 2, 4, 8, 12, 16, 20, 24, 32, 40, 64, 128]
    offload: ["on", "off"]
  tp8:
    users: [1, 2, 4, 8, 16, 24, 32, 48, 64, 128, 256]
    offload: ["on", "off"]

b200-fp4-dsr1-weka-trace:
  tp4:
    ep: 4
    users: [1, 2, 4, 8, 12, 16, 24, 32, 48, 64, 128, 256]
    offload: ["on", "off"]
  tp8:
    ep: 8
    users: [1, 2, 4, 8, 12, 16, 32, 64, 128, 256, 512]
    offload: ["on", "off"]

mi355x-fp4-dsr1-weka-trace:
  tp8:
    users: [1, 2, 4, 8, 12, 16, 32, 64, 128, 256, 512]
    offload: ["on", "off"]
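The config above is a per-platform sweep matrix: each `tpN` block crosses a list of concurrent `users` values with `offload` on/off (plus an optional `ep` for MoE expert parallelism), and each resulting tuple becomes one benchmark job. As a minimal sketch of that expansion, the script below hardcodes the `h100-fp8-llama70b-lmcache` `tp2` values from the file above; the function name `expand_sweep` and its output format are illustrative, not part of this PR.

```shell
#!/usr/bin/env bash
# Hypothetical sweep expansion: cross users x offload for one tp setting.
set -u

expand_sweep() {
  local tp=2
  local users=(1 2 4 6 8 10 12 16 20 32 64)
  local offload=("on" "off")
  local u o
  for u in "${users[@]}"; do
    for o in "${offload[@]}"; do
      # Each (tp, users, offload) tuple maps to one workflow_call dispatch.
      echo "tp=${tp} users=${u} offload=${o}"
    done
  done
}

expand_sweep   # 11 users x 2 offload modes = 22 jobs for this tp block
```

The full matrix across all tp blocks of one platform entry is just the union of these per-block expansions.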
192 changes: 192 additions & 0 deletions .github/workflows/benchmark-multiturn-tmpl.yml
@@ -0,0 +1,192 @@
name: Template - Multi-Turn Benchmark
on:
  workflow_call:
    inputs:
      runner:
        required: true
        type: string
      image:
        required: true
        type: string
      model:
        required: true
        type: string
      precision:
        required: false
        type: string
        default: 'fp4'
      exp-name:
        required: true
        type: string
      tp:
        required: true
        type: string
      users:
        required: true
        type: string
      offload-mode:
        description: "on = prefix+offload, off = prefix only, noprefix = no prefix caching"
        required: true
        type: string
      duration:
        required: false
        type: string
        default: ''
      request-rate:
        description: "Request rate per client (Poisson, req/s). 0 = no delay."
        required: false
        type: string
        default: '0'
      total-cpu-dram-gb:
        required: false
        type: string
        default: '300'
      script-suffix:
        description: "Suffix appended to benchmark script name (e.g. '_lmcache')"
        required: false
        type: string
        default: ''
      ep:
        description: "Expert parallelism size (for MoE models)"
        required: false
        type: string
        default: '0'
      ref:
        description: "Git ref (branch/sha) to checkout"
        required: false
        type: string
      ignore-eos:
        description: "Ignore EOS token and force exact output_tokens from trace"
        required: false
        type: string
        default: 'false'
      trace-dir:
        description: "Override trace directory (relative to kv-cache-tester dir)"
        required: false
        type: string
        default: ''

env:
  HF_TOKEN: ${{ secrets.HF_TOKEN }}
  HF_HUB_CACHE: '/mnt/hf_hub_cache/'
  EXP_NAME: ${{ inputs.exp-name }}
  MODEL: ${{ inputs.model }}
  IMAGE: ${{ inputs.image }}
  PRECISION: ${{ inputs.precision }}
  FRAMEWORK: 'vllm'
  TP: ${{ inputs.tp }}
  EP_SIZE: ${{ inputs.ep }}
  USERS: ${{ inputs.users }}
  OFFLOAD_MODE: ${{ inputs.offload-mode }}
  DURATION: ${{ inputs.duration }}
  REQUEST_RATE: ${{ inputs.request-rate }}
  TOTAL_CPU_DRAM_GB: ${{ inputs.total-cpu-dram-gb }}
  SCRIPT_SUFFIX: ${{ inputs.script-suffix }}
  SPEC_DECODING: 'off'
  IGNORE_EOS: ${{ inputs.ignore-eos }}
  TRACE_DIR: ${{ inputs.trace-dir }}

permissions:
  contents: read

jobs:
  benchmark:
    runs-on: ${{ inputs.runner }}
    timeout-minutes: 180
    name: "${{ inputs.exp-name }} tp=${{ inputs.tp }} users=${{ inputs.users }} offload=${{ inputs.offload-mode }}"
    steps:
      - name: Resource cleanup (pre-run)
        run: &resource-cleanup |
          # Cleanup Docker resources
          if command -v docker >/dev/null 2>&1 && docker info >/dev/null 2>&1; then
            echo "[Docker] Cleaning up resources ..."
            docker ps -aq | xargs -r docker rm -f
            docker network prune -f
            while [ -n "$(docker ps -aq)" ]; do
              docker ps -a
              sleep 5
            done
          fi

          # Cleanup SLURM resources
          if command -v squeue >/dev/null 2>&1; then
            if [[ "${{ runner.name }}" == mi355x-amds* || "${{ runner.name }}" == mi325x-amd* || "${{ runner.name }}" == mi300x-amds* || "${{ runner.name }}" == gb200-nv* || "${{ runner.name }}" == gb300-nv* || "${{ runner.name }}" == h100-cw* || "${{ runner.name }}" == h200-cw* || "${{ runner.name }}" == b200-nb* || "${{ runner.name }}" == h200-nb* || "${{ runner.name }}" == h100-dgxc-slurm* || "${{ runner.name }}" == h200-dgxc-slurm* || "${{ runner.name }}" == b200-dgxc-slurm* ]]; then
              echo "[Slurm] Cleaning up jobs with name: ${{ runner.name }} ..."
              scancel --name="${{ runner.name }}" || true
              while [ -n "$(squeue --name='${{ runner.name }}' --noheader --format='%i')" ]; do
                squeue --name="${{ runner.name }}"
                sleep 5
              done
            else
              echo "[Slurm] Cleaning up jobs for user: $USER ..."
              scancel -u "$USER" || true
              while [ -n "$(squeue -u "$USER" --noheader --format='%i')" ]; do
                squeue -u "$USER"
                sleep 5
              done
            fi
          fi

      - name: Clean stale git locks
        run: find . -name 'index.lock' -delete 2>/dev/null || true

      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          token: ${{ secrets.REPO_PAT }}
          fetch-depth: 0
          ref: ${{ inputs.ref || github.sha }}
          submodules: true

      - name: Launch job script
        env:
          RUNNER_NAME: ${{ runner.name }}
          RESULT_DIR: /workspace/results
        run: |
          bash ./runners/launch_${RUNNER_NAME%%_*}.sh

          # The runner script doesn't propagate exit codes (scancel masks them).
          # Check status.txt to determine if the benchmark actually succeeded.
          if [ ! -f results/status.txt ]; then
            echo "Run failed: results/status.txt not found." >&2
            exit 1
          fi
          STATUS=$(cat results/status.txt)
          if [ "$STATUS" != "SUCCESS" ]; then
            echo "Run failed: status=$STATUS" >&2
            cat results/benchmark.log 2>/dev/null || true
            exit 1
          fi

      - name: Upload results
        if: always()
        uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0
        with:
          name: "multiturn_tp${{ inputs.tp }}_users${{ inputs.users }}_offload${{ inputs.offload-mode }}"
          path: |
            results/metrics_client_metrics.csv
            results/metrics_server_metrics.csv
            results/metrics_plots.png
            results/benchmark.log
            results/config.yaml
            results/vllm_command.txt
            results/benchmark_command.txt
            results/benchmark_metadata.json
            results/metrics_workload.png
            results/aiperf_artifacts/profile_export_aiperf.csv
            results/workload_distribution_summary.txt
            results/workload_distribution_plots.png
            results/trace_replay/detailed_results.csv
            results/status.txt
          if-no-files-found: ignore

      - name: Upload server logs
        if: always()
        uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0
        with:
          name: "server_logs_tp${{ inputs.tp }}_users${{ inputs.users }}_offload${{ inputs.offload-mode }}"
          path: results/server.log
          if-no-files-found: ignore

      - name: Resource cleanup (post-run)
        if: always()
        run: *resource-cleanup
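One subtle piece of the "Launch job script" step is `launch_${RUNNER_NAME%%_*}.sh`: the `%%_*` parameter expansion strips everything from the first underscore onward, so many ephemeral runner instances from one pool all resolve to the same launch script. A small sketch (runner names here are illustrative, not taken from this PR):

```shell
# How the launch-script path is derived from a runner name via ${name%%_*}.
resolve_launch_script() {
  local runner_name="$1"
  # %%_* deletes the longest suffix starting at the first underscore.
  echo "runners/launch_${runner_name%%_*}.sh"
}

resolve_launch_script "h200-nb_instance-3"   # prints runners/launch_h200-nb.sh
resolve_launch_script "h100-cw"              # no underscore: name used as-is
```

This is why the various "add SCRIPT_SUFFIX support" commits touch each runner family's launch script: the suffix (e.g. `_lmcache`) selects a benchmark variant inside the resolved script, while the runner name alone selects which launch script runs.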