ModelTC · sufubao · Jan 22, 2026 · Jan 22, 2026 · Jan 22, 2026 · Jan 22, 2026
diff --git a/last_bench/BENCHMARK_REPORT.md b/last_bench/BENCHMARK_REPORT.md
@@ -0,0 +1,123 @@
+# GLM-4.7-Flash Attention Backend Performance Report
+
+Generated from 63 benchmark measurements.
+
+**Backends compared**: triton, fa3, flashmla, flashinfer, lightllm-fa3, lightllm-flashinfer, lightllm-triton
+
+**Scenarios** (input length → output length, in tokens):
+- 1000→1000: Short input, short output
+- 1000→8000: Short input, long output
+- 8000→1000: Long input, short output
+
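+**Concurrency levels**: 10, 80, 320 (the accompanying benchmark scripts pass these as `--num-prompts` and pair them with `--max-concurrency` 1, 16, 64 respectively)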
+
+---
+
+## Executive Summary
+
+Best performing backend per scenario (highest output throughput):
+
+| Scenario | Best Backend | Max Throughput (tok/s) | @ Concurrency |
+|----------|--------------|------------------------|---------------|
+| 1000→1000 | **lightllm-flashinfer** | 2,768.4 | 320 |
+| 1000→8000 | **lightllm-flashinfer** | 2,684.0 | 320 |
+| 8000→1000 | **lightllm-fa3** | 1,985.7 | 320 |
+
+---
+
+## Detailed Results by Scenario
+
+### Scenario: 1000→1000 (Short Input → Short Output)
+
+| Concurrency | Backend    | QPS   | Throughput (tok/s) | TTFT (ms) | P99 TTFT (ms) | ITL (ms) | P99 ITL (ms) |
+|-------------|------------|-------|-------------------|-----------|---------------|----------|--------------|
+| 10          | triton     |  0.24 |             101.5 |     269.6 |        1261.0 |     9.20 |        10.18 |
+|             | fa3        |  0.26 |             110.9 |     105.9 |         176.0 |     8.75 |         8.96 |
+|             | flashmla   |  0.26 |             109.9 |     112.2 |         221.7 |     8.82 |         9.25 |
+|             | flashinfer |  0.28 |             117.3 |     113.2 |         164.7 |     8.25 |         8.50 |
+|             | lightllm-fa3 |  0.35 |             149.5 |      60.6 |          72.1 |     6.56 |         8.40 |
+|             | lightllm-flashinfer |  0.35 |             149.6 |      58.6 |          72.7 |     6.56 |         8.41 |
+|             | lightllm-triton |  0.35 |             146.4 |      54.3 |          70.3 |     6.72 |         8.42 |
+| 80          | triton     |  1.36 |             694.5 |     392.1 |        1561.8 |    19.45 |        90.76 |
+|             | fa3        |  1.50 |             764.0 |     153.1 |         337.5 |    18.00 |        88.61 |
+|             | flashmla   |  1.45 |             739.7 |     184.9 |         429.6 |    18.50 |        94.12 |
+|             | flashinfer |  1.67 |             853.5 |     166.6 |         383.4 |    16.12 |        91.41 |
+|             | lightllm-fa3 |  2.04 |           1,041.0 |      65.8 |         103.2 |    13.13 |        15.34 |
+|             | lightllm-flashinfer |  2.05 |           1,046.3 |      63.6 |         125.0 |    13.07 |        15.35 |
+|             | lightllm-triton |  1.73 |             880.7 |      67.6 |          97.6 |    15.79 |        18.57 |
+| 320         | triton     |  3.63 |           1,928.4 |     144.5 |         260.2 |    29.49 |        98.22 |
+|             | fa3        |  3.93 |           2,090.8 |     189.7 |         498.5 |    27.11 |        94.84 |
+|             | flashmla   |  3.76 |           2,000.3 |     269.5 |         868.0 |    28.32 |        99.13 |
+|             | flashinfer |  4.21 |           2,236.0 |     162.3 |         314.6 |    25.97 |        96.63 |
+|             | lightllm-fa3 |  5.02 |           2,668.5 |     158.7 |         430.5 |    21.16 |        80.35 |
+|             | lightllm-flashinfer |  5.21 |           2,768.4 |     137.9 |         531.4 |    20.33 |        94.61 |
+|             | lightllm-triton |  4.39 |           2,332.7 |     123.9 |         276.3 |    24.11 |        80.39 |
+
+### Scenario: 1000→8000 (Short Input → Long Output)
+
+| Concurrency | Backend    | QPS   | Throughput (tok/s) | TTFT (ms) | P99 TTFT (ms) | ITL (ms) | P99 ITL (ms) |
+|-------------|------------|-------|-------------------|-----------|---------------|----------|--------------|
+| 10          | triton     |  0.02 |              84.1 |     143.2 |         384.8 |    11.86 |        16.50 |
+|             | fa3        |  0.03 |             112.9 |     112.0 |         138.9 |     8.84 |         9.10 |
+|             | flashmla   |  0.02 |             100.4 |     164.7 |         664.4 |     9.93 |        11.77 |
+|             | flashinfer |  0.03 |             120.0 |     123.0 |         223.8 |     8.34 |         8.74 |
+|             | lightllm-fa3 |  0.03 |             149.8 |     206.3 |         593.4 |     6.64 |         8.45 |
+|             | lightllm-flashinfer |  0.03 |             147.9 |     154.0 |         230.5 |     6.73 |         8.45 |
+|             | lightllm-triton |  0.02 |             110.4 |     200.0 |         571.9 |     9.02 |        14.45 |
+| 80          | triton     |  0.16 |             623.7 |     250.5 |         880.0 |    22.17 |        27.21 |
+|             | fa3        |  0.21 |             840.0 |     214.5 |         720.5 |    16.77 |        17.85 |
+|             | flashmla   |  0.21 |             820.2 |     294.6 |        1089.6 |    17.01 |        18.01 |
+|             | flashinfer |  0.24 |             947.0 |     214.8 |         738.7 |    14.99 |        15.76 |
+|             | lightllm-fa3 |  0.26 |           1,054.0 |     468.2 |        2356.6 |    13.35 |        15.25 |
+|             | lightllm-flashinfer |  0.26 |           1,050.6 |     206.3 |         701.2 |    13.42 |        15.21 |
+|             | lightllm-triton |  0.17 |             677.4 |     167.8 |         393.4 |    20.18 |        23.16 |
+| 320         | triton     |  0.40 |           1,634.9 |     331.7 |        1511.1 |    34.20 |        75.71 |
+|             | fa3        |  0.57 |           2,310.4 |     284.6 |        1195.5 |    24.40 |        73.23 |
+|             | flashmla   |  0.58 |           2,367.8 |     347.4 |        1505.3 |    23.62 |        96.43 |
+|             | flashinfer |  0.64 |           2,615.2 |     289.3 |        1173.0 |    21.98 |        91.45 |
+|             | lightllm-fa3 |  0.65 |           2,660.1 |     170.5 |         400.1 |    21.42 |        76.17 |
+|             | lightllm-flashinfer |  0.66 |           2,684.0 |     213.3 |         557.1 |    21.18 |        94.81 |
+|             | lightllm-triton |  0.42 |           1,710.8 |     221.4 |         679.5 |    32.72 |        78.24 |
+
+### Scenario: 8000→1000 (Long Input → Short Output)
+
+| Concurrency | Backend    | QPS   | Throughput (tok/s) | TTFT (ms) | P99 TTFT (ms) | ITL (ms) | P99 ITL (ms) |
+|-------------|------------|-------|-------------------|-----------|---------------|----------|--------------|
+| 10          | triton     |  0.17 |              73.7 |     217.7 |         426.0 |    13.08 |        16.85 |
+|             | fa3        |  0.25 |             107.0 |     208.0 |         464.6 |     8.87 |         9.13 |
+|             | flashmla   |  0.22 |              93.8 |     188.3 |         448.2 |    10.24 |        11.91 |
+|             | flashinfer |  0.27 |             112.9 |     169.5 |         270.5 |     8.47 |         8.89 |
+|             | lightllm-fa3 |  0.34 |             142.7 |     151.8 |         276.2 |     6.67 |         8.47 |
+|             | lightllm-flashinfer |  0.33 |             140.7 |     139.2 |         242.7 |     6.79 |         8.49 |
+|             | lightllm-triton |  0.22 |              94.7 |     167.5 |         366.2 |    10.19 |        14.63 |
+| 80          | triton     |  0.87 |             454.3 |    1275.7 |        8538.7 |    29.22 |       199.05 |
+|             | fa3        |  1.34 |             697.6 |     451.7 |        2349.4 |    19.57 |       110.56 |
+|             | flashmla   |  1.31 |             682.1 |     538.0 |        2850.3 |    19.76 |       103.51 |
+|             | flashinfer |  1.43 |             745.5 |     453.6 |        2351.6 |    18.35 |       109.11 |
+|             | lightllm-fa3 |  1.76 |             919.2 |     181.4 |         734.0 |    15.08 |        88.50 |
+|             | lightllm-flashinfer |  1.76 |             914.9 |     184.9 |         661.3 |    15.13 |        93.04 |
+|             | lightllm-triton |  1.12 |             581.7 |     225.4 |         973.7 |    23.53 |       105.45 |
+| 320         | triton     |  1.50 |             795.5 |    3873.7 |       30295.7 |    68.73 |       464.47 |
+|             | fa3        |  2.74 |           1,453.1 |    1138.1 |        8315.6 |    38.80 |       196.49 |
+|             | flashmla   |  2.84 |           1,506.9 |    1201.1 |        8968.1 |    37.03 |       181.73 |
+|             | flashinfer |  2.85 |           1,515.0 |    1119.6 |        8247.4 |    37.54 |       195.69 |
+|             | lightllm-fa3 |  3.74 |           1,985.7 |     277.2 |         673.9 |    29.14 |       147.19 |
+|             | lightllm-flashinfer |  3.70 |           1,964.2 |     273.3 |         717.0 |    29.49 |       134.72 |
+|             | lightllm-triton |  2.36 |           1,254.9 |     313.2 |         717.6 |    46.08 |       199.79 |
+
+---
+
+## Key Findings
+
+1. **Highest Throughput**: lightllm-flashinfer achieves 2,768.4 tok/s on 1000→1000 @ concurrency 320
+2. **Lowest TTFT**: lightllm-triton achieves 54.3ms on 1000→1000 @ concurrency 10
+3. **Lowest ITL**: lightllm-fa3 and lightllm-flashinfer both achieve 6.56ms on 1000→1000 @ concurrency 10
+
+### Concurrency Scaling (1000→8000 scenario, output throughput in tok/s)
+
+| Backend | 10 conc | 80 conc | 320 conc | Scale Factor (320 ÷ 10) |
+|---------|---------|---------|----------|--------------|
+| triton   |    84.1 |   623.7 |  1,634.9 |         19.4x |
+| fa3      |   112.9 |   840.0 |  2,310.4 |         20.5x |
+| flashmla |   100.4 |   820.2 |  2,367.8 |         23.6x |
+| flashinfer |   120.0 |   947.0 |  2,615.2 |         21.8x |
diff --git a/last_bench/bench.sh b/last_bench/bench.sh
@@ -0,0 +1,90 @@
+#!/usr/bin/env bash
+#
+# Runs python -m sglang.bench_serving for every combination of scenario,
+# load level and backend configured below.
+#
+# This script does not start any server: each run is sent to ports[k] and
+# labelled tags[k], so something must already be serving on those ports.
+#
+# Environment:
+#   MODEL_PATH  value passed to --model (default: /dev/shm/GLM-4.7-Flash)
+#
+# Exit status: 0 when every run succeeded; 1 when the configuration is
+# inconsistent, python is missing, or at least one run failed.
+set -euo pipefail
+IFS=$'\n\t'
+
+# Print a UTC-timestamped message on stderr.
+log() { printf '%s - %s\n' "$(date -u +"%Y-%m-%dT%H:%M:%SZ")" "$*" >&2; }
+
+# Parallel arrays, paired by index:
+#   input_len[i]   <-> output_len[i]        (one scenario)
+#   num_prompts[j] <-> max_concurrencys[j]  (one load level)
+#   tags[k]        <-> ports[k]             (one backend)
+input_len=(1000 8000)
+output_len=(8000 1000)
+num_prompts=(10 80 320)
+max_concurrencys=(1 16 64)
+tags=("triton" "fa3" "flashmla" "flashinfer")
+ports=(30000 30001 30002 30003)
+
+# With set -u, indexing past the end of a shorter array would abort the
+# script part-way through the runs, so reject mismatched lengths up front.
+if ((${#input_len[@]} != ${#output_len[@]})) ||
+  ((${#num_prompts[@]} != ${#max_concurrencys[@]})) ||
+  ((${#tags[@]} != ${#ports[@]})); then
+  log "Error: paired arrays (input_len/output_len, num_prompts/max_concurrencys, tags/ports) differ in length"
+  exit 1
+fi
+
+# Model path can be overridden by env var MODEL_PATH
+MODEL_PATH=${MODEL_PATH:-/dev/shm/GLM-4.7-Flash}
+if [ ! -e "$MODEL_PATH" ]; then
+  # Deliberately non-fatal: execution continues past this check.
+  log "Warning: model path '$MODEL_PATH' does not exist. Proceeding anyway."
+else
+  log "Using model path: $MODEL_PATH"
+fi
+
+if ! command -v python >/dev/null 2>&1; then
+  log "Error: python not found in PATH"
+  exit 1
+fi
+
+# A failed run is logged and counted, and the remaining runs still execute.
+failures=0
+for ((i = 0; i < ${#output_len[@]}; i++)); do
+  for ((j = 0; j < ${#num_prompts[@]}; j++)); do
+    for ((z = 0; z < ${#tags[@]}; z++)); do
+      PORT=${ports[$z]}
+      TAG=${tags[$z]}
+      log "Starting benchmark: input_len=${input_len[$i]} output_len=${output_len[$i]} num_prompts=${num_prompts[$j]} max_concurrency=${max_concurrencys[$j]} port=$PORT tag=$TAG"
+      if python -m sglang.bench_serving \
+        --backend sglang-oai \
+        --model "$MODEL_PATH" \
+        --dataset-name random \
+        --random-input-len "${input_len[$i]}" \
+        --random-output-len "${output_len[$i]}" \
+        --num-prompts "${num_prompts[$j]}" \
+        --max-concurrency "${max_concurrencys[$j]}" \
+        --request-rate inf \
+        --flush-cache \
+        --port "$PORT" \
+        --tag "$TAG"; then
+        log "Benchmark succeeded for port=$PORT tag=$TAG"
+      else
+        rc=$? # must be read first: status of the python command above
+        failures=$((failures + 1))
+        log "Benchmark FAILED (rc=$rc) for port=$PORT tag=$TAG"
+      fi
+      sleep 1
+    done
+  done
+done
+
+# Surface failures in the exit status so callers need not parse the log.
+if ((failures > 0)); then
+  log "Error: $failures benchmark run(s) failed"
+  exit 1
+fi
+log "All benchmark runs succeeded"
diff --git a/last_bench/bench_lightllm.sh b/last_bench/bench_lightllm.sh
@@ -0,0 +1,92 @@
+#!/usr/bin/env bash
+#
+# Runs python -m sglang.bench_serving for every combination of scenario,
+# load level and backend configured below.
+#
+# This script does not start any server: each run is sent to ports[k] and
+# labelled tags[k], so something must already be serving on those ports.
+#
+# Environment:
+#   MODEL_PATH  value passed to --model (default: /dev/shm/GLM-4.7-Flash)
+#
+# Exit status: 0 when every run succeeded; 1 when the configuration is
+# inconsistent, python is missing, or at least one run failed.
+set -euo pipefail
+IFS=$'\n\t'
+
+# Print a UTC-timestamped message on stderr.
+log() { printf '%s - %s\n' "$(date -u +"%Y-%m-%dT%H:%M:%SZ")" "$*" >&2; }
+
+# Parallel arrays, paired by index:
+#   input_len[i]   <-> output_len[i]        (one scenario)
+#   num_prompts[j] <-> max_concurrencys[j]  (one load level)
+#   tags[k]        <-> ports[k]             (one backend)
+input_len=(1000)
+output_len=(1000)
+num_prompts=(10 80 320)
+max_concurrencys=(1 16 64)
+tags=("lightllm-fa3" "lightllm-flashinfer" "lightllm-triton")
+ports=(24000 24001 24002)
+
+# With set -u, indexing past the end of a shorter array would abort the
+# script part-way through the runs, so reject mismatched lengths up front.
+if ((${#input_len[@]} != ${#output_len[@]})) ||
+  ((${#num_prompts[@]} != ${#max_concurrencys[@]})) ||
+  ((${#tags[@]} != ${#ports[@]})); then
+  log "Error: paired arrays (input_len/output_len, num_prompts/max_concurrencys, tags/ports) differ in length"
+  exit 1
+fi
+
+# Model path can be overridden by env var MODEL_PATH
+MODEL_PATH=${MODEL_PATH:-/dev/shm/GLM-4.7-Flash}
+if [ ! -e "$MODEL_PATH" ]; then
+  # Deliberately non-fatal: execution continues past this check.
+  log "Warning: model path '$MODEL_PATH' does not exist. Proceeding anyway."
+else
+  log "Using model path: $MODEL_PATH"
+fi
+
+if ! command -v python >/dev/null 2>&1; then
+  log "Error: python not found in PATH"
+  exit 1
+fi
+
+# A failed run is logged and counted, and the remaining runs still execute.
+failures=0
+for ((i = 0; i < ${#output_len[@]}; i++)); do
+  for ((j = 0; j < ${#num_prompts[@]}; j++)); do
+    for ((z = 0; z < ${#tags[@]}; z++)); do
+      PORT=${ports[$z]}
+      TAG=${tags[$z]}
+      log "Starting benchmark: input_len=${input_len[$i]} output_len=${output_len[$i]} num_prompts=${num_prompts[$j]} max_concurrency=${max_concurrencys[$j]} port=$PORT tag=$TAG"
+      # NOTE(review): the sglang-oai client backend is used against the
+      # lightllm-tagged ports; presumably they serve a compatible API - confirm.
+      if python -m sglang.bench_serving \
+        --backend sglang-oai \
+        --model "$MODEL_PATH" \
+        --dataset-name random \
+        --random-input-len "${input_len[$i]}" \
+        --random-output-len "${output_len[$i]}" \
+        --num-prompts "${num_prompts[$j]}" \
+        --max-concurrency "${max_concurrencys[$j]}" \
+        --request-rate inf \
+        --flush-cache \
+        --port "$PORT" \
+        --tag "$TAG"; then
+        log "Benchmark succeeded for port=$PORT tag=$TAG"
+      else
+        rc=$? # must be read first: status of the python command above
+        failures=$((failures + 1))
+        log "Benchmark FAILED (rc=$rc) for port=$PORT tag=$TAG"
+      fi
+      sleep 1
+    done
+  done
+done
+
+# Surface failures in the exit status so callers need not parse the log.
+if ((failures > 0)); then
+  log "Error: $failures benchmark run(s) failed"
+  exit 1
+fi
+log "All benchmark runs succeeded"