diff --git a/last_bench/BENCHMARK_REPORT.md b/last_bench/BENCHMARK_REPORT.md new file mode 100644 index 000000000..834490c5f --- /dev/null +++ b/last_bench/BENCHMARK_REPORT.md @@ -0,0 +1,123 @@ +# GLM-4.7-Flash Attention Backend Performance Report + +Generated from 63 benchmark measurements. + +**Backends compared**: triton, fa3, flashmla, flashinfer + +**Scenarios**: +- 1000→1000: Short input, short output +- 1000→8000: Short input, long output +- 8000→1000: Long input, short output + +**Concurrency levels**: 10, 80, 320 + +--- + +## Executive Summary + +Best performing backend per scenario (highest output throughput): + +| Scenario | Best Backend | Max Throughput (tok/s) | @ Concurrency | +|----------|--------------|------------------------|---------------| +| 1000→1000 | **lightllm-flashinfer** | 2,768.4 | 320 | +| 1000→8000 | **lightllm-flashinfer** | 2,684.0 | 320 | +| 8000→1000 | **lightllm-fa3** | 1,985.7 | 320 | + +--- + +## Detailed Results by Scenario + +### Scenario: 1000→1000 (Short Input → Short Output) + +| Concurrency | Backend | QPS | Throughput (tok/s) | TTFT (ms) | P99 TTFT (ms) | ITL (ms) | P99 ITL (ms) | +|-------------|------------|-------|-------------------|-----------|---------------|----------|--------------| +| 10 | triton | 0.24 | 101.5 | 269.6 | 1261.0 | 9.20 | 10.18 | +| | fa3 | 0.26 | 110.9 | 105.9 | 176.0 | 8.75 | 8.96 | +| | flashmla | 0.26 | 109.9 | 112.2 | 221.7 | 8.82 | 9.25 | +| | flashinfer | 0.28 | 117.3 | 113.2 | 164.7 | 8.25 | 8.50 | +| | lightllm-fa3 | 0.35 | 149.5 | 60.6 | 72.1 | 6.56 | 8.40 | +| | lightllm-flashinfer | 0.35 | 149.6 | 58.6 | 72.7 | 6.56 | 8.41 | +| | lightllm-triton | 0.35 | 146.4 | 54.3 | 70.3 | 6.72 | 8.42 | +| 80 | triton | 1.36 | 694.5 | 392.1 | 1561.8 | 19.45 | 90.76 | +| | fa3 | 1.50 | 764.0 | 153.1 | 337.5 | 18.00 | 88.61 | +| | flashmla | 1.45 | 739.7 | 184.9 | 429.6 | 18.50 | 94.12 | +| | flashinfer | 1.67 | 853.5 | 166.6 | 383.4 | 16.12 | 91.41 | +| | lightllm-fa3 | 2.04 | 1,041.0 | 65.8 | 103.2 | 13.13 | 15.34 | +| | lightllm-flashinfer | 2.05 | 1,046.3 | 63.6 | 125.0 | 13.07 | 15.35 | +| | lightllm-triton | 1.73 | 880.7 | 67.6 | 97.6 | 15.79 | 18.57 | +| 320 | triton | 3.63 | 1,928.4 | 144.5 | 260.2 | 29.49 | 98.22 | +| | fa3 | 3.93 | 2,090.8 | 189.7 | 498.5 | 27.11 | 94.84 | +| | flashmla | 3.76 | 2,000.3 | 269.5 | 868.0 | 28.32 | 99.13 | +| | flashinfer | 4.21 | 2,236.0 | 162.3 | 314.6 | 25.97 | 96.63 | +| | lightllm-fa3 | 5.02 | 2,668.5 | 158.7 | 430.5 | 21.16 | 80.35 | +| | lightllm-flashinfer | 5.21 | 2,768.4 | 137.9 | 531.4 | 20.33 | 94.61 | +| | lightllm-triton | 4.39 | 2,332.7 | 123.9 | 276.3 | 24.11 | 80.39 | + +### Scenario: 1000→8000 (Short Input → Long Output) + +| Concurrency | Backend | QPS | Throughput (tok/s) | TTFT (ms) | P99 TTFT (ms) | ITL (ms) | P99 ITL (ms) | +|-------------|------------|-------|-------------------|-----------|---------------|----------|--------------| +| 10 | triton | 0.02 | 84.1 | 143.2 | 384.8 | 11.86 | 16.50 | +| | fa3 | 0.03 | 112.9 | 112.0 | 138.9 | 8.84 | 9.10 | +| | flashmla | 0.02 | 100.4 | 164.7 | 664.4 | 9.93 | 11.77 | +| | flashinfer | 0.03 | 120.0 | 123.0 | 223.8 | 8.34 | 8.74 | +| | lightllm-fa3 | 0.03 | 149.8 | 206.3 | 593.4 | 6.64 | 8.45 | +| | lightllm-flashinfer | 0.03 | 147.9 | 154.0 | 230.5 | 6.73 | 8.45 | +| | lightllm-triton | 0.02 | 110.4 | 200.0 | 571.9 | 9.02 | 14.45 | +| 80 | triton | 0.16 | 623.7 | 250.5 | 880.0 | 22.17 | 27.21 | +| | fa3 | 0.21 | 840.0 | 214.5 | 720.5 | 16.77 | 17.85 | +| | flashmla | 0.21 | 820.2 | 294.6 | 1089.6 | 17.01 | 18.01 | +| | 
flashinfer | 0.24 | 947.0 | 214.8 | 738.7 | 14.99 | 15.76 | +| | lightllm-fa3 | 0.26 | 1,054.0 | 468.2 | 2356.6 | 13.35 | 15.25 | +| | lightllm-flashinfer | 0.26 | 1,050.6 | 206.3 | 701.2 | 13.42 | 15.21 | +| | lightllm-triton | 0.17 | 677.4 | 167.8 | 393.4 | 20.18 | 23.16 | +| 320 | triton | 0.40 | 1,634.9 | 331.7 | 1511.1 | 34.20 | 75.71 | +| | fa3 | 0.57 | 2,310.4 | 284.6 | 1195.5 | 24.40 | 73.23 | +| | flashmla | 0.58 | 2,367.8 | 347.4 | 1505.3 | 23.62 | 96.43 | +| | flashinfer | 0.64 | 2,615.2 | 289.3 | 1173.0 | 21.98 | 91.45 | +| | lightllm-fa3 | 0.65 | 2,660.1 | 170.5 | 400.1 | 21.42 | 76.17 | +| | lightllm-flashinfer | 0.66 | 2,684.0 | 213.3 | 557.1 | 21.18 | 94.81 | +| | lightllm-triton | 0.42 | 1,710.8 | 221.4 | 679.5 | 32.72 | 78.24 | + +### Scenario: 8000→1000 (Long Input → Short Output) + +| Concurrency | Backend | QPS | Throughput (tok/s) | TTFT (ms) | P99 TTFT (ms) | ITL (ms) | P99 ITL (ms) | +|-------------|------------|-------|-------------------|-----------|---------------|----------|--------------| +| 10 | triton | 0.17 | 73.7 | 217.7 | 426.0 | 13.08 | 16.85 | +| | fa3 | 0.25 | 107.0 | 208.0 | 464.6 | 8.87 | 9.13 | +| | flashmla | 0.22 | 93.8 | 188.3 | 448.2 | 10.24 | 11.91 | +| | flashinfer | 0.27 | 112.9 | 169.5 | 270.5 | 8.47 | 8.89 | +| | lightllm-fa3 | 0.34 | 142.7 | 151.8 | 276.2 | 6.67 | 8.47 | +| | lightllm-flashinfer | 0.33 | 140.7 | 139.2 | 242.7 | 6.79 | 8.49 | +| | lightllm-triton | 0.22 | 94.7 | 167.5 | 366.2 | 10.19 | 14.63 | +| 80 | triton | 0.87 | 454.3 | 1275.7 | 8538.7 | 29.22 | 199.05 | +| | fa3 | 1.34 | 697.6 | 451.7 | 2349.4 | 19.57 | 110.56 | +| | flashmla | 1.31 | 682.1 | 538.0 | 2850.3 | 19.76 | 103.51 | +| | flashinfer | 1.43 | 745.5 | 453.6 | 2351.6 | 18.35 | 109.11 | +| | lightllm-fa3 | 1.76 | 919.2 | 181.4 | 734.0 | 15.08 | 88.50 | +| | lightllm-flashinfer | 1.76 | 914.9 | 184.9 | 661.3 | 15.13 | 93.04 | +| | lightllm-triton | 1.12 | 581.7 | 225.4 | 973.7 | 23.53 | 105.45 | +| 320 | triton | 1.50 | 795.5 | 3873.7 | 30295.7 | 68.73 | 464.47 | +| | fa3 | 2.74 | 1,453.1 | 1138.1 | 8315.6 | 38.80 | 196.49 | +| | flashmla | 2.84 | 1,506.9 | 1201.1 | 8968.1 | 37.03 | 181.73 | +| | flashinfer | 2.85 | 1,515.0 | 1119.6 | 8247.4 | 37.54 | 195.69 | +| | lightllm-fa3 | 3.74 | 1,985.7 | 277.2 | 673.9 | 29.14 | 147.19 | +| | lightllm-flashinfer | 3.70 | 1,964.2 | 273.3 | 717.0 | 29.49 | 134.72 | +| | lightllm-triton | 2.36 | 1,254.9 | 313.2 | 717.6 | 46.08 | 199.79 | + +--- + +## Key Findings + +1. **Highest Throughput**: lightllm-flashinfer achieves 2,768.4 tok/s on 1000→1000 @ concurrency 320 +2. **Lowest TTFT**: lightllm-triton achieves 54.3ms on 1000→1000 @ concurrency 10 +3. 
**Lowest ITL**: lightllm-fa3 achieves 6.56ms on 1000→1000 @ concurrency 10 + +### Concurrency Scaling (1000→8000 scenario) + +| Backend | 10 conc | 80 conc | 320 conc | Scale Factor | +|---------|---------|---------|----------|--------------| +| triton | 84.1 | 623.7 | 1,634.9 | 19.4x | +| fa3 | 112.9 | 840.0 | 2,310.4 | 20.5x | +| flashmla | 100.4 | 820.2 | 2,367.8 | 23.6x | +| flashinfer | 120.0 | 947.0 | 2,615.2 | 21.8x | diff --git a/last_bench/bench.sh b/last_bench/bench.sh new file mode 100644 index 000000000..54aba7883 --- /dev/null +++ b/last_bench/bench.sh @@ -0,0 +1,56 @@ +#!/usr/bin/env bash +set -euo pipefail +IFS=$'\n\t' + +log() { printf '%s - %s\n' "$(date -u +"%Y-%m-%dT%H:%M:%SZ")" "$*" >&2; } + +input_len=(1000 8000) +output_len=(8000 1000) +num_prompts=(10 80 320) +max_concurrencys=(1 16 64) +tags=("triton" "fa3" "flashmla" "flashinfer") +ports=(30000 30001 30002 30003) + +# Model path can be overridden by env var MODEL_PATH +MODEL_PATH=${MODEL_PATH:-/dev/shm/GLM-4.7-Flash} +if [ ! -e "$MODEL_PATH" ]; then + log "Warning: model path '$MODEL_PATH' does not exist. Proceeding anyway." +else + log "Using model path: $MODEL_PATH" +fi + +if ! command -v python >/dev/null 2>&1; then + log "Error: python not found in PATH" + exit 1 +fi + +count1=${#output_len[@]} +for ((i=0; i&2; } + +input_len=(1000) +output_len=(1000) +num_prompts=(10 80 320) +max_concurrencys=(1 16 64) +tags=("lightllm-fa3" "lightllm-flashinfer" "lightllm-triton") +ports=(24000 24001 24002) + +# Model path can be overridden by env var MODEL_PATH +MODEL_PATH=${MODEL_PATH:-/dev/shm/GLM-4.7-Flash} +if [ ! -e "$MODEL_PATH" ]; then + log "Warning: model path '$MODEL_PATH' does not exist. Proceeding anyway." +else + log "Using model path: $MODEL_PATH" +fi + +if ! command -v python >/dev/null 2>&1; then + log "Error: python not found in PATH" + exit 1 +fi + +count1=${#output_len[@]} +for ((i=0; i tuple[int, int, int] | None: + """Extract concurrency, input_len, output_len from filename. 
+ + Pattern: sglang-oai_0123_{concurrency}_{input}_{output}.jsonl + """ + if not filename.endswith(".jsonl"): + return None + parts = filename.replace(".jsonl", "").split("_") + if len(parts) < 5: + return None + try: + concurrency = int(parts[2]) + input_len = int(parts[3]) + output_len = int(parts[4]) + return concurrency, input_len, output_len + except (ValueError, IndexError): + return None + + +def load_benchmark_data(data_dir: Path) -> list[BenchmarkResult]: + """Load all benchmark results from JSONL files.""" + results = [] + + for filepath in data_dir.glob("sglang-oai_*.jsonl"): + parsed = parse_filename(filepath.name) + if not parsed: + continue + concurrency, input_len, output_len = parsed + + with open(filepath) as f: + for line in f: + data = json.loads(line) + results.append( + BenchmarkResult( + tag=data["tag"], + concurrency=concurrency, + input_len=input_len, + output_len=output_len, + request_throughput=data["request_throughput"], + output_throughput=data["output_throughput"], + mean_ttft_ms=data["mean_ttft_ms"], + p99_ttft_ms=data["p99_ttft_ms"], + mean_itl_ms=data["mean_itl_ms"], + p99_itl_ms=data["p99_itl_ms"], + ) + ) + + return results + + +def group_by_scenario(results: list[BenchmarkResult]) -> dict[str, list[BenchmarkResult]]: + """Group results by scenario (input_len -> output_len).""" + grouped = defaultdict(list) + for r in results: + key = f"{r.input_len}→{r.output_len}" + grouped[key].append(r) + return grouped + + +def find_best_backend(results: list[BenchmarkResult]) -> tuple[str, float]: + """Find the backend with highest throughput for given results.""" + best = max(results, key=lambda r: r.output_throughput) + return best.tag, best.output_throughput + + +def generate_executive_summary(scenarios: dict[str, list[BenchmarkResult]]) -> str: + """Generate executive summary table.""" + lines = [ + "## Executive Summary", + "", + "Best performing backend per scenario (highest output throughput):", + "", + "| Scenario | Best Backend | Max Throughput (tok/s) | @ Concurrency |", + "|----------|--------------|------------------------|---------------|", + ] + + scenario_order = ["1000→1000", "1000→8000", "8000→1000"] + for scenario in scenario_order: + if scenario not in scenarios: + continue + results = scenarios[scenario] + best = max(results, key=lambda r: r.output_throughput) + lines.append(f"| {scenario} | **{best.tag}** | {best.output_throughput:,.1f} | {best.concurrency} |") + + lines.append("") + return "\n".join(lines) + + +def generate_scenario_table(scenario: str, results: list[BenchmarkResult]) -> str: + """Generate detailed comparison table for a scenario.""" + # Scenario descriptions + scenario_desc = { + "1000→1000": "Short Input → Short Output", + "1000→8000": "Short Input → Long Output", + "8000→1000": "Long Input → Short Output", + } + desc = scenario_desc.get(scenario, scenario) + + lines = [ + f"### Scenario: {scenario} ({desc})", + "", + "| Concurrency | Backend | QPS | Throughput (tok/s) | TTFT (ms) | P99 TTFT (ms) | ITL (ms) | P99 ITL (ms) |", + "|-------------|------------|-------|-------------------|-----------|---------------|----------|--------------|", + ] + + # Sort by concurrency first, then by backend + backend_order = ["triton", "fa3", "flashmla", "flashinfer"] + concurrency_order = [10, 80, 320] + + sorted_results = sorted( + results, + key=lambda r: ( + concurrency_order.index(r.concurrency) if r.concurrency in concurrency_order else 99, + backend_order.index(r.tag) if r.tag in backend_order else 99, + ), + ) + + current_concurrency 
= None + for r in sorted_results: + conc_display = str(r.concurrency) if r.concurrency != current_concurrency else "" + current_concurrency = r.concurrency + + lines.append( + f"| {conc_display:11} | {r.tag:10} | " + f"{r.request_throughput:5.2f} | " + f"{r.output_throughput:17,.1f} | " + f"{r.mean_ttft_ms:9.1f} | " + f"{r.p99_ttft_ms:13.1f} | " + f"{r.mean_itl_ms:8.2f} | " + f"{r.p99_itl_ms:12.2f} |" + ) + + lines.append("") + return "\n".join(lines) + + +def generate_key_findings(results: list[BenchmarkResult]) -> str: + """Generate key findings section.""" + lines = [ + "## Key Findings", + "", + ] + + # 1. Highest overall throughput + best_overall = max(results, key=lambda r: r.output_throughput) + lines.append( + f"1. **Highest Throughput**: {best_overall.tag} achieves " + f"{best_overall.output_throughput:,.1f} tok/s on " + f"{best_overall.input_len}→{best_overall.output_len} @ concurrency {best_overall.concurrency}" + ) + + # 2. Lowest TTFT + best_ttft = min(results, key=lambda r: r.mean_ttft_ms) + lines.append( + f"2. **Lowest TTFT**: {best_ttft.tag} achieves " + f"{best_ttft.mean_ttft_ms:.1f}ms on " + f"{best_ttft.input_len}→{best_ttft.output_len} @ concurrency {best_ttft.concurrency}" + ) + + # 3. Lowest ITL + best_itl = min(results, key=lambda r: r.mean_itl_ms) + lines.append( + f"3. **Lowest ITL**: {best_itl.tag} achieves " + f"{best_itl.mean_itl_ms:.2f}ms on " + f"{best_itl.input_len}→{best_itl.output_len} @ concurrency {best_itl.concurrency}" + ) + + # 4. Concurrency scaling analysis + lines.append("") + lines.append("### Concurrency Scaling (1000→8000 scenario)") + lines.append("") + lines.append("| Backend | 10 conc | 80 conc | 320 conc | Scale Factor |") + lines.append("|---------|---------|---------|----------|--------------|") + + scenario_results = [r for r in results if r.input_len == 1000 and r.output_len == 8000] + backend_perf = defaultdict(dict) + for r in scenario_results: + backend_perf[r.tag][r.concurrency] = r.output_throughput + + for backend in ["triton", "fa3", "flashmla", "flashinfer"]: + if backend not in backend_perf: + continue + perf = backend_perf[backend] + if 10 in perf and 320 in perf: + scale = perf[320] / perf[10] if perf[10] > 0 else 0 + lines.append( + f"| {backend:8} | {perf.get(10, 0):7,.1f} | " + f"{perf.get(80, 0):7,.1f} | {perf.get(320, 0):8,.1f} | {scale:12.1f}x |" + ) + + lines.append("") + return "\n".join(lines) + + +def generate_report(data_dir: Path) -> str: + """Generate the full benchmark report.""" + results = load_benchmark_data(data_dir) + + if not results: + return "# Error\n\nNo benchmark data found." 
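+    # Assemble the report: group measurements by scenario key (e.g. "1000→1000"),
+    # then emit the header, executive summary, per-scenario tables, and key findings.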
+ + scenarios = group_by_scenario(results) + + report = [ + "# GLM-4.7-Flash Attention Backend Performance Report", + "", + f"Generated from {len(results)} benchmark measurements.", + "", + "**Backends compared**: triton, fa3, flashmla, flashinfer", + "", + "**Scenarios**:", + "- 1000→1000: Short input, short output", + "- 1000→8000: Short input, long output", + "- 8000→1000: Long input, short output", + "", + "**Concurrency levels**: 10, 80, 320", + "", + "---", + "", + generate_executive_summary(scenarios), + "---", + "", + "## Detailed Results by Scenario", + "", + ] + + # Generate tables in consistent order + for scenario in ["1000→1000", "1000→8000", "8000→1000"]: + if scenario in scenarios: + report.append(generate_scenario_table(scenario, scenarios[scenario])) + + report.append("---") + report.append("") + report.append(generate_key_findings(results)) + + return "\n".join(report) + + +def main(): + script_dir = Path(__file__).parent + report = generate_report(script_dir) + + output_path = script_dir / "BENCHMARK_REPORT.md" + with open(output_path, "w") as f: + f.write(report) + + print(f"Report generated: {output_path}") + print(report) + + +if __name__ == "__main__": + main() diff --git a/last_bench/sglang-oai_0123_10_1000_1000.jsonl b/last_bench/sglang-oai_0123_10_1000_1000.jsonl new file mode 100644 index 000000000..dca82481a --- /dev/null +++ b/last_bench/sglang-oai_0123_10_1000_1000.jsonl @@ -0,0 +1,7 @@ +{"tag": "triton", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 1, "sharegpt_output_len": null, "random_input_len": 1000, "random_output_len": 1000, "random_range_ratio": 0.0, "server_info": null, "duration": 41.56710090395063, "completed": 10, "total_input_tokens": 6101, "total_input_text_tokens": 6101, "total_input_vision_tokens": 0, "total_output_tokens": 4220, "total_output_tokens_retokenized": 4219, "request_throughput": 0.24057487249608928, "input_throughput": 146.77472970986406, "output_throughput": 101.52259619334967, "total_throughput": 248.29732590321373, "mean_e2e_latency_ms": 4141.624511638656, "median_e2e_latency_ms": 3203.8242494454607, "std_e2e_latency_ms": 2485.9917727216202, "p90_e2e_latency_ms": 7356.209329259581, "p99_e2e_latency_ms": 9327.435238955077, "mean_ttft_ms": 269.6008140454069, "median_ttft_ms": 96.12666291650385, "std_ttft_ms": 370.8697777080779, "p99_ttft_ms": 1260.9661481156945, "mean_tpot_ms": 9.034784566756835, "median_tpot_ms": 9.118396484478719, "std_tpot_ms": 0.4145035968282636, "p99_tpot_ms": 9.59984958153813, "mean_itl_ms": 9.197189547221072, "median_itl_ms": 9.218816994689405, "std_itl_ms": 0.49230433913096455, "p95_itl_ms": 9.893044945783913, "p99_itl_ms": 10.180208240635693, "concurrency": 0.9963707888141476, "accept_length": null, "max_output_tokens_per_s": 120.0, "max_concurrent_requests": 2} +{"tag": "fa3", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 1, "sharegpt_output_len": null, "random_input_len": 1000, "random_output_len": 1000, "random_range_ratio": 0.0, "server_info": null, "duration": 38.03990558185615, "completed": 10, "total_input_tokens": 6101, "total_input_text_tokens": 6101, "total_input_vision_tokens": 0, "total_output_tokens": 4220, "total_output_tokens_retokenized": 4218, "request_throughput": 0.2628818302001698, "input_throughput": 160.3842046051236, "output_throughput": 110.93613234447167, "total_throughput": 271.3203369495953, "mean_e2e_latency_ms": 3789.365150523372, "median_e2e_latency_ms": 3030.7643914129585, 
"std_e2e_latency_ms": 2347.649909442257, "p90_e2e_latency_ms": 6861.860890546814, "p99_e2e_latency_ms": 8518.549765171483, "mean_ttft_ms": 105.94553446862847, "median_ttft_ms": 94.56806734669954, "std_ttft_ms": 27.94510373820782, "p99_ttft_ms": 176.04060200043023, "mean_tpot_ms": 8.735117569860298, "median_tpot_ms": 8.751374765776054, "std_tpot_ms": 0.04156977680352886, "p99_tpot_ms": 8.76773780092932, "mean_itl_ms": 8.749201726566584, "median_itl_ms": 8.751522051170468, "std_itl_ms": 0.14378221568601485, "p95_itl_ms": 8.877559565007687, "p99_itl_ms": 8.959061477798969, "concurrency": 0.9961552460663261, "accept_length": null, "max_output_tokens_per_s": 115.0, "max_concurrent_requests": 2} +{"tag": "flashmla", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 1, "sharegpt_output_len": null, "random_input_len": 1000, "random_output_len": 1000, "random_range_ratio": 0.0, "server_info": null, "duration": 38.40182148804888, "completed": 10, "total_input_tokens": 6101, "total_input_text_tokens": 6101, "total_input_vision_tokens": 0, "total_output_tokens": 4220, "total_output_tokens_retokenized": 4215, "request_throughput": 0.26040431449617885, "input_throughput": 158.8726722741187, "output_throughput": 109.89062071738748, "total_throughput": 268.7632929915062, "mean_e2e_latency_ms": 3825.576449208893, "median_e2e_latency_ms": 3036.6330899996683, "std_e2e_latency_ms": 2415.8852833995174, "p90_e2e_latency_ms": 6976.653711660765, "p99_e2e_latency_ms": 8777.255602276418, "mean_ttft_ms": 112.22376921214163, "median_ttft_ms": 94.2523704143241, "std_ttft_ms": 44.1469574516126, "p99_ttft_ms": 221.72820456326008, "mean_tpot_ms": 8.747894237986943, "median_tpot_ms": 8.784269532090837, "std_tpot_ms": 0.1802130309851715, "p99_tpot_ms": 8.992865002797622, "mean_itl_ms": 8.820299398443815, "median_itl_ms": 8.825775003060699, "std_itl_ms": 0.22406159202597428, "p95_itl_ms": 9.14960989030078, "p99_itl_ms": 9.246604079380631, "concurrency": 0.9961966128089678, "accept_length": null, "max_output_tokens_per_s": 118.0, "max_concurrent_requests": 2} +{"tag": "flashinfer", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 1, "sharegpt_output_len": null, "random_input_len": 1000, "random_output_len": 1000, "random_range_ratio": 0.0, "server_info": null, "duration": 35.988416022853926, "completed": 10, "total_input_tokens": 6101, "total_input_text_tokens": 6101, "total_input_vision_tokens": 0, "total_output_tokens": 4220, "total_output_tokens_retokenized": 3918, "request_throughput": 0.27786718908800107, "input_throughput": 169.52677206258946, "output_throughput": 117.25995379513645, "total_throughput": 286.7867258577259, "mean_e2e_latency_ms": 3584.2942386865616, "median_e2e_latency_ms": 2880.021173041314, "std_e2e_latency_ms": 2210.4487376161856, "p90_e2e_latency_ms": 6500.186281767674, "p99_e2e_latency_ms": 8042.241794501898, "mean_ttft_ms": 113.1668952992186, "median_ttft_ms": 102.46228356845677, "std_ttft_ms": 23.684222136567993, "p99_ttft_ms": 164.71267985878512, "mean_tpot_ms": 8.226941944769623, "median_tpot_ms": 8.265041945772195, "std_tpot_ms": 0.08986236974100431, "p99_tpot_ms": 8.326550089040984, "mean_itl_ms": 8.250821643859407, "median_itl_ms": 8.252424886450171, "std_itl_ms": 0.3354421269852178, "p95_itl_ms": 8.401609491556883, "p99_itl_ms": 8.499805759638548, "concurrency": 0.9959577649681517, "accept_length": null, "max_output_tokens_per_s": 123.0, "max_concurrent_requests": 2} +{"tag": "lightllm-fa3", 
"backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 1, "sharegpt_output_len": null, "random_input_len": 1000, "random_output_len": 1000, "random_range_ratio": 0.0, "server_info": null, "duration": 28.220483848825097, "completed": 10, "total_input_tokens": 6101, "total_input_text_tokens": 6101, "total_input_vision_tokens": 0, "total_output_tokens": 4220, "total_output_tokens_retokenized": 4219, "request_throughput": 0.35435253532750216, "input_throughput": 216.19048180330907, "output_throughput": 149.5367699082059, "total_throughput": 365.727251711515, "mean_e2e_latency_ms": 2821.2477078894153, "median_e2e_latency_ms": 2261.518812039867, "std_e2e_latency_ms": 1752.5711889017846, "p90_e2e_latency_ms": 5119.068665453233, "p99_e2e_latency_ms": 6318.86261240812, "mean_ttft_ms": 60.56937051471323, "median_ttft_ms": 61.048407456837595, "std_ttft_ms": 10.235625045106131, "p99_ttft_ms": 72.07421808270738, "mean_tpot_ms": 6.549284494990456, "median_tpot_ms": 6.55624221613761, "std_tpot_ms": 0.026076747154878004, "p99_tpot_ms": 6.5676378215672475, "mean_itl_ms": 6.558974201037124, "median_itl_ms": 6.219592876732349, "std_itl_ms": 0.8434367069907109, "p95_itl_ms": 8.309309324249625, "p99_itl_ms": 8.402309883385898, "concurrency": 0.9997162780775186, "accept_length": null, "max_output_tokens_per_s": 153.0, "max_concurrent_requests": 2} +{"tag": "lightllm-flashinfer", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 1, "sharegpt_output_len": null, "random_input_len": 1000, "random_output_len": 1000, "random_range_ratio": 0.0, "server_info": null, "duration": 28.21518262499012, "completed": 10, "total_input_tokens": 6101, "total_input_text_tokens": 6101, "total_input_vision_tokens": 0, "total_output_tokens": 4220, "total_output_tokens_retokenized": 4220, "request_throughput": 0.3544191130325353, "input_throughput": 216.2311008611498, "output_throughput": 149.5648656997299, "total_throughput": 365.7959665608797, "mean_e2e_latency_ms": 2820.835434575565, "median_e2e_latency_ms": 2276.5965425642207, "std_e2e_latency_ms": 1759.2921206087265, "p90_e2e_latency_ms": 5145.300170383415, "p99_e2e_latency_ms": 6369.066203290132, "mean_ttft_ms": 58.56175431981683, "median_ttft_ms": 52.00532451272011, "std_ttft_ms": 10.69389116363121, "p99_ttft_ms": 72.72814616095275, "mean_tpot_ms": 6.5418024206640935, "median_tpot_ms": 6.5778018938299265, "std_tpot_ms": 0.10667361038828493, "p99_tpot_ms": 6.681071237688413, "mean_itl_ms": 6.562763047186636, "median_itl_ms": 6.223411066457629, "std_itl_ms": 0.8022116495868372, "p95_itl_ms": 8.315606787800789, "p99_itl_ms": 8.406318174675107, "concurrency": 0.9997579927330181, "accept_length": null, "max_output_tokens_per_s": 158.0, "max_concurrent_requests": 2} +{"tag": "lightllm-triton", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 1, "sharegpt_output_len": null, "random_input_len": 1000, "random_output_len": 1000, "random_range_ratio": 0.0, "server_info": null, "duration": 28.820174572058022, "completed": 10, "total_input_tokens": 6101, "total_input_text_tokens": 6101, "total_input_vision_tokens": 0, "total_output_tokens": 4220, "total_output_tokens_retokenized": 4220, "request_throughput": 0.3469791612468331, "input_throughput": 211.6919862766929, "output_throughput": 146.42520604616357, "total_throughput": 358.11719232285645, "mean_e2e_latency_ms": 2881.241054646671, "median_e2e_latency_ms": 2301.2861979659647, "std_e2e_latency_ms": 
1878.5403060746103, "p90_e2e_latency_ms": 5358.587349089794, "p99_e2e_latency_ms": 6771.017882206944, "mean_ttft_ms": 54.279814031906426, "median_ttft_ms": 50.7953364867717, "std_ttft_ms": 8.100198475876049, "p99_ttft_ms": 70.34216834232211, "mean_tpot_ms": 6.580841954172073, "median_tpot_ms": 6.67901388540887, "std_tpot_ms": 0.350263682853537, "p99_tpot_ms": 7.056966771200029, "mean_itl_ms": 6.716453389913626, "median_itl_ms": 6.224676966667175, "std_itl_ms": 0.9832705612541962, "p95_itl_ms": 8.326812135055661, "p99_itl_ms": 8.418112937361002, "concurrency": 0.9997306044912427, "accept_length": null, "max_output_tokens_per_s": 165.0, "max_concurrent_requests": 2} diff --git a/last_bench/sglang-oai_0123_10_1000_8000.jsonl b/last_bench/sglang-oai_0123_10_1000_8000.jsonl new file mode 100644 index 000000000..b6c1a8b66 --- /dev/null +++ b/last_bench/sglang-oai_0123_10_1000_8000.jsonl @@ -0,0 +1,7 @@ +{"tag": "triton", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 1, "sharegpt_output_len": null, "random_input_len": 1000, "random_output_len": 8000, "random_range_ratio": 0.0, "server_info": {"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30000, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 1, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": null, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 68555837, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, 
"show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "triton", "decode_attention_backend": null, "prefill_attention_backend": null, "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", 
"mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": false, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, 
"enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "status": "ready", "max_total_num_tokens": 1386225, "max_req_input_len": 202746, "internal_states": [{"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30000, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 1, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": 3500, "pp_async_batch_depth": 0, 
"stream_interval": 1, "stream_output": false, "random_seed": 68555837, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "triton", "decode_attention_backend": "triton", "prefill_attention_backend": "triton", "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, 
"deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, 
"enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": true, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "use_mla_backend": true, "last_gen_throughput": 68.4354602959235, "memory_usage": {"weight": 56.33, "kvcache": 69.9, "token_capacity": 1386225, "graph": 1.69}, "effective_max_running_requests_per_dp": 3500}], "version": "0.0.0.dev1+g1e8e0cca2"}, "duration": 528.4287535739131, "completed": 10, "total_input_tokens": 6101, "total_input_text_tokens": 6101, "total_input_vision_tokens": 0, "total_output_tokens": 44462, "total_output_tokens_retokenized": 44449, "request_throughput": 0.01892402699960434, "input_throughput": 11.545548872458607, "output_throughput": 84.1400088456408, "total_throughput": 95.68555771809942, "mean_e2e_latency_ms": 52840.63192738686, "median_e2e_latency_ms": 54310.54771656636, "std_e2e_latency_ms": 32405.91208850261, "p90_e2e_latency_ms": 95479.23450903034, "p99_e2e_latency_ms": 100470.4938506335, "mean_ttft_ms": 143.15742638427764, "median_ttft_ms": 105.27272755280137, "std_ttft_ms": 89.88346432853983, "p99_ttft_ms": 384.79625445324933, "mean_tpot_ms": 11.041872593796489, "median_tpot_ms": 11.331380416371044, "std_tpot_ms": 1.5086878772657037, "p99_tpot_ms": 12.987873410365033, "mean_itl_ms": 11.857849385685803, "median_itl_ms": 11.53362705372274, "std_itl_ms": 2.106867488894714, "p95_itl_ms": 15.640562167391181, "p99_itl_ms": 16.497120447456837, "concurrency": 0.999957545270024, 
"accept_length": null, "max_output_tokens_per_s": 121.0, "max_concurrent_requests": 2} +{"tag": "fa3", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 1, "sharegpt_output_len": null, "random_input_len": 1000, "random_output_len": 8000, "random_range_ratio": 0.0, "server_info": {"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30001, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 1, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": null, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 1062566034, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, 
"hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "fa3", "decode_attention_backend": null, "prefill_attention_backend": null, "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 
32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": false, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], 
"custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "status": "ready", "max_total_num_tokens": 1386225, "max_req_input_len": 202746, "internal_states": [{"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30001, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 1, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": 3500, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 1062566034, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, 
"bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "fa3", "decode_attention_backend": "fa3", "prefill_attention_backend": "fa3", "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", 
"disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": false, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": 
false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "use_mla_backend": true, "last_gen_throughput": 111.72392428829242, "memory_usage": {"weight": 56.33, "kvcache": 69.9, "token_capacity": 1386225, "graph": 2.36}, "effective_max_running_requests_per_dp": 3500}], "version": "0.0.0.dev1+g1e8e0cca2"}, "duration": 393.95918229292147, "completed": 10, "total_input_tokens": 6101, "total_input_text_tokens": 6101, "total_input_vision_tokens": 0, "total_output_tokens": 44462, "total_output_tokens_retokenized": 44449, "request_throughput": 0.025383340329315322, "input_throughput": 15.486375934915278, "output_throughput": 112.85940777220178, "total_throughput": 128.34578370711705, "mean_e2e_latency_ms": 39394.25162828993, "median_e2e_latency_ms": 42470.40231700521, "std_e2e_latency_ms": 21636.962726875605, "p90_e2e_latency_ms": 66409.18134008534, "p99_e2e_latency_ms": 68663.32626647316, "mean_ttft_ms": 111.9626430561766, "median_ttft_ms": 101.87083506025374, "std_ttft_ms": 17.221441402427743, "p99_ttft_ms": 138.93227798631418, "mean_tpot_ms": 8.802688235486457, "median_tpot_ms": 8.818598380117141, "std_tpot_ms": 0.06861274446224537, "p99_tpot_ms": 8.872189325777665, "mean_itl_ms": 8.83899862207603, "median_itl_ms": 8.827421930618584, "std_itl_ms": 0.18904639497524522, "p95_itl_ms": 9.004280308727175, "p99_itl_ms": 9.104600874707103, "concurrency": 0.9999576960995675, "accept_length": null, "max_output_tokens_per_s": 115.0, "max_concurrent_requests": 2} +{"tag": "flashmla", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 1, "sharegpt_output_len": null, "random_input_len": 1000, "random_output_len": 8000, "random_range_ratio": 0.0, "server_info": {"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30002, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, 
"dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 64, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": null, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 1011399013, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "flashmla", "decode_attention_backend": null, "prefill_attention_backend": null, "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": 
"auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": 
false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": false, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "status": "ready", "max_total_num_tokens": 1386176, "max_req_input_len": 202746, "internal_states": [{"model_path": 
"/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30002, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 64, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": 3500, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 1011399013, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, 
"json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "flashmla", "decode_attention_backend": "flashmla", "prefill_attention_backend": "flashmla", "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, 
"cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": false, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", 
"remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "use_mla_backend": true, "last_gen_throughput": 90.13693125010776, "memory_usage": {"weight": 56.33, "kvcache": 69.9, "token_capacity": 1386176, "graph": 1.21}, "effective_max_running_requests_per_dp": 3500}], "version": "0.0.0.dev1+g1e8e0cca2"}, "duration": 443.0159071299713, "completed": 10, "total_input_tokens": 6101, "total_input_text_tokens": 6101, "total_input_vision_tokens": 0, "total_output_tokens": 44462, "total_output_tokens_retokenized": 44439, "request_throughput": 0.022572552901731845, "input_throughput": 13.7715145253466, "output_throughput": 100.36208471168014, "total_throughput": 114.13359923702673, "mean_e2e_latency_ms": 44299.69251912553, "median_e2e_latency_ms": 46765.18414099701, "std_e2e_latency_ms": 25605.205296934604, "p90_e2e_latency_ms": 77119.38225780614, "p99_e2e_latency_ms": 80277.98145196866, "mean_ttft_ms": 164.6503965370357, "median_ttft_ms": 98.8025760743767, "std_ttft_ms": 184.20579272535184, "p99_ttft_ms": 664.3669738760218, "mean_tpot_ms": 9.603287608095876, "median_tpot_ms": 9.73514685671584, "std_tpot_ms": 0.6061427340384441, "p99_tpot_ms": 10.30806361595789, "mean_itl_ms": 9.931152820655184, "median_itl_ms": 9.84680699184537, "std_itl_ms": 0.8601496753226304, "p95_itl_ms": 11.435698019340634, "p99_itl_ms": 11.772641446441412, "concurrency": 0.9999571529184155, "accept_length": null, "max_output_tokens_per_s": 118.0, "max_concurrent_requests": 2} +{"tag": "flashinfer", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 1, "sharegpt_output_len": null, "random_input_len": 1000, "random_output_len": 8000, "random_range_ratio": 0.0, "server_info": {"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30003, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 1, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": 
false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": null, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 493872944, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "flashinfer", "decode_attention_backend": null, "prefill_attention_backend": null, "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, 
"speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, 
"triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": false, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "status": "ready", "max_total_num_tokens": 1386222, "max_req_input_len": 202746, "internal_states": [{"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30003, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, 
"mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 1, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": 3500, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 493872944, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "flashinfer", "decode_attention_backend": "flashinfer", "prefill_attention_backend": "flashinfer", "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, 
"speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, 
"tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": false, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "use_mla_backend": true, "last_gen_throughput": 116.78249468138866, "memory_usage": {"weight": 56.33, "kvcache": 69.9, "token_capacity": 1386222, "graph": 1.71}, "effective_max_running_requests_per_dp": 3500}], "version": "0.0.0.dev1+g1e8e0cca2"}, "duration": 370.64318000199273, "completed": 10, "total_input_tokens": 6101, "total_input_text_tokens": 6101, "total_input_vision_tokens": 0, "total_output_tokens": 44462, 
"total_output_tokens_retokenized": 41404, "request_throughput": 0.026980126816163828, "input_throughput": 16.46057537054155, "output_throughput": 119.95903985002761, "total_throughput": 136.41961522056914, "mean_e2e_latency_ms": 37062.73019104265, "median_e2e_latency_ms": 39886.838593520224, "std_e2e_latency_ms": 20455.257739139928, "p90_e2e_latency_ms": 62683.871512743644, "p99_e2e_latency_ms": 64792.23027887987, "mean_ttft_ms": 122.99183551222086, "median_ttft_ms": 100.1137139974162, "std_ttft_ms": 45.0379113541844, "p99_ttft_ms": 223.7851848336868, "mean_tpot_ms": 8.248293524737983, "median_tpot_ms": 8.286151737614464, "std_tpot_ms": 0.12572598005984015, "p99_tpot_ms": 8.366339839761116, "mean_itl_ms": 8.340799370035327, "median_itl_ms": 8.37263313587755, "std_itl_ms": 2.689567428376299, "p95_itl_ms": 8.641683915629983, "p99_itl_ms": 8.737169874366373, "concurrency": 0.9999571607075943, "accept_length": null, "max_output_tokens_per_s": 131.0, "max_concurrent_requests": 2} +{"tag": "lightllm-fa3", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 1, "sharegpt_output_len": null, "random_input_len": 1000, "random_output_len": 8000, "random_range_ratio": 0.0, "server_info": null, "duration": 296.87213805201463, "completed": 10, "total_input_tokens": 6101, "total_input_text_tokens": 6101, "total_input_vision_tokens": 0, "total_output_tokens": 44462, "total_output_tokens_retokenized": 44460, "request_throughput": 0.03368453525351682, "input_throughput": 20.550934958170615, "output_throughput": 149.7681806441865, "total_throughput": 170.3191156023571, "mean_e2e_latency_ms": 29686.422172025777, "median_e2e_latency_ms": 31977.473180973902, "std_e2e_latency_ms": 16341.231223519195, "p90_e2e_latency_ms": 50013.6461708229, "p99_e2e_latency_ms": 52065.068823597394, "mean_ttft_ms": 206.2621021643281, "median_ttft_ms": 163.88948948588222, "std_ttft_ms": 148.8543077347853, "p99_ttft_ms": 593.3920549717733, "mean_tpot_ms": 6.604791340804418, "median_tpot_ms": 6.613572524991938, "std_tpot_ms": 0.05190789972059481, "p99_tpot_ms": 6.664961845437845, "mean_itl_ms": 6.637431996100034, "median_itl_ms": 6.223700940608978, "std_itl_ms": 0.8980564150345929, "p95_itl_ms": 8.328629587776959, "p99_itl_ms": 8.446590006351471, "concurrency": 0.9999733342043857, "accept_length": null, "max_output_tokens_per_s": 153.0, "max_concurrent_requests": 2} +{"tag": "lightllm-flashinfer", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 1, "sharegpt_output_len": null, "random_input_len": 1000, "random_output_len": 8000, "random_range_ratio": 0.0, "server_info": null, "duration": 300.56394808203913, "completed": 10, "total_input_tokens": 6101, "total_input_text_tokens": 6101, "total_input_vision_tokens": 0, "total_output_tokens": 44462, "total_output_tokens_retokenized": 44459, "request_throughput": 0.033270790005960706, "input_throughput": 20.298508982636626, "output_throughput": 147.9285865245025, "total_throughput": 168.22709550713913, "mean_e2e_latency_ms": 30055.323450965807, "median_e2e_latency_ms": 32303.914289921522, "std_e2e_latency_ms": 16614.672857992904, "p90_e2e_latency_ms": 50797.29593801312, "p99_e2e_latency_ms": 52831.220259703696, "mean_ttft_ms": 153.99227743037045, "median_ttft_ms": 154.41859746351838, "std_ttft_ms": 40.539656892122096, "p99_ttft_ms": 230.53783738985658, "mean_tpot_ms": 6.687357886347274, "median_tpot_ms": 6.697858736484612, "std_tpot_ms": 0.0747425716300049, "p99_tpot_ms": 6.810022278762517, 
"mean_itl_ms": 6.7319551750847815, "median_itl_ms": 6.230928935110569, "std_itl_ms": 1.0190941574897405, "p95_itl_ms": 8.343739341944456, "p99_itl_ms": 8.449985329061745, "concurrency": 0.9999643550983096, "accept_length": null, "max_output_tokens_per_s": 157.0, "max_concurrent_requests": 2} +{"tag": "lightllm-triton", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 1, "sharegpt_output_len": null, "random_input_len": 1000, "random_output_len": 8000, "random_range_ratio": 0.0, "server_info": null, "duration": 402.82201804290526, "completed": 10, "total_input_tokens": 6101, "total_input_text_tokens": 6101, "total_input_vision_tokens": 0, "total_output_tokens": 44462, "total_output_tokens_retokenized": 44462, "request_throughput": 0.024824859496470928, "input_throughput": 15.145646778796912, "output_throughput": 110.37629029320904, "total_throughput": 125.52193707200595, "mean_e2e_latency_ms": 40281.3204290811, "median_e2e_latency_ms": 40955.73136943858, "std_e2e_latency_ms": 25145.381849844358, "p90_e2e_latency_ms": 73342.79716589954, "p99_e2e_latency_ms": 78086.60535515985, "mean_ttft_ms": 200.0209585763514, "median_ttft_ms": 171.22362996451557, "std_ttft_ms": 140.57473101890918, "p99_ttft_ms": 571.9467777409592, "mean_tpot_ms": 8.330440810182232, "median_tpot_ms": 8.543196817424905, "std_tpot_ms": 1.272966997457874, "p99_tpot_ms": 10.03476722604763, "mean_itl_ms": 9.024065831756937, "median_itl_ms": 8.3220349624753, "std_itl_ms": 2.06384829597527, "p95_itl_ms": 12.451428978238255, "p99_itl_ms": 14.448113285470754, "concurrency": 0.9999781199842622, "accept_length": null, "max_output_tokens_per_s": 163.0, "max_concurrent_requests": 2} diff --git a/last_bench/sglang-oai_0123_10_8000_1000.jsonl b/last_bench/sglang-oai_0123_10_8000_1000.jsonl new file mode 100644 index 000000000..ab41238d5 --- /dev/null +++ b/last_bench/sglang-oai_0123_10_8000_1000.jsonl @@ -0,0 +1,7 @@ +{"tag": "triton", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 1, "sharegpt_output_len": null, "random_input_len": 8000, "random_output_len": 1000, "random_range_ratio": 0.0, "server_info": {"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30000, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, 
"page_size": 1, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": null, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 604505033, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "triton", "decode_attention_backend": null, "prefill_attention_backend": null, "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, 
"speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": 
"eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": false, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "status": "ready", "max_total_num_tokens": 1386224, "max_req_input_len": 202746, "internal_states": [{"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30000, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, 
"modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 1, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": 3500, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 604505033, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "triton", "decode_attention_backend": "triton", "prefill_attention_backend": "triton", "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, 
"speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, 
"enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": true, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "use_mla_backend": true, "last_gen_throughput": 73.51944093316781, "memory_usage": {"weight": 56.33, "kvcache": 69.9, "token_capacity": 1386224, "graph": 1.69}, "effective_max_running_requests_per_dp": 3500}], "version": "0.0.0.dev1+g1e8e0cca2"}, "duration": 57.25094021507539, "completed": 10, "total_input_tokens": 41941, 
"total_input_text_tokens": 41941, "total_input_vision_tokens": 0, "total_output_tokens": 4220, "total_output_tokens_retokenized": 4220, "request_throughput": 0.1746696204889014, "input_throughput": 732.5818552925012, "output_throughput": 73.71057984631638, "total_throughput": 806.2924351388176, "mean_e2e_latency_ms": 5722.978232218884, "median_e2e_latency_ms": 4641.29843947012, "std_e2e_latency_ms": 3921.2995928261576, "p90_e2e_latency_ms": 12518.891114904545, "p99_e2e_latency_ms": 12771.664290658664, "mean_ttft_ms": 217.65351290814579, "median_ttft_ms": 177.99088754691184, "std_ttft_ms": 112.65007093616718, "p99_ttft_ms": 425.97797273192555, "mean_tpot_ms": 12.595701176155286, "median_tpot_ms": 13.11032631924969, "std_tpot_ms": 2.5773326227353888, "p99_tpot_ms": 16.479375187077174, "mean_itl_ms": 13.076763751810404, "median_itl_ms": 13.160730595700443, "std_itl_ms": 2.441664585133937, "p95_itl_ms": 16.710556542966515, "p99_itl_ms": 16.849553296342492, "concurrency": 0.9996304358879161, "accept_length": null, "max_output_tokens_per_s": 110.0, "max_concurrent_requests": 2} +{"tag": "fa3", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 1, "sharegpt_output_len": null, "random_input_len": 8000, "random_output_len": 1000, "random_range_ratio": 0.0, "server_info": {"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30001, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 1, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": null, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 398782077, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, 
"log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "fa3", "decode_attention_backend": null, "prefill_attention_backend": null, "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, 
"deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": false, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, 
"enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "status": "ready", "max_total_num_tokens": 1386225, "max_req_input_len": 202746, "internal_states": [{"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30001, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 1, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, 
"prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": 3500, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 398782077, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "fa3", "decode_attention_backend": "fa3", "prefill_attention_backend": "fa3", "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, 
"moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, 
"enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": false, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "use_mla_backend": true, "last_gen_throughput": 112.54967193050531, "memory_usage": {"weight": 56.33, "kvcache": 69.9, "token_capacity": 1386225, "graph": 2.36}, "effective_max_running_requests_per_dp": 3500}], "version": "0.0.0.dev1+g1e8e0cca2"}, "duration": 39.43922132183798, "completed": 10, "total_input_tokens": 41941, "total_input_text_tokens": 41941, "total_input_vision_tokens": 0, "total_output_tokens": 4220, "total_output_tokens_retokenized": 4220, "request_throughput": 0.25355470176240213, "input_throughput": 1063.433774661691, "output_throughput": 107.0000841437337, "total_throughput": 1170.4338588054245, "mean_e2e_latency_ms": 3942.344288667664, "median_e2e_latency_ms": 3168.642749893479, "std_e2e_latency_ms": 2358.272403480565, "p90_e2e_latency_ms": 7263.652879674918, "p99_e2e_latency_ms": 8585.422784292605, "mean_ttft_ms": 208.0219859490171, "median_ttft_ms": 162.48971258755773, "std_ttft_ms": 122.61083935112063, "p99_ttft_ms": 464.63888159254566, "mean_tpot_ms": 8.857586609576051, "median_tpot_ms": 8.86137127263336, "std_tpot_ms": 0.07794326844451306, "p99_tpot_ms": 8.973473299517387, "mean_itl_ms": 8.870108765793297, "median_itl_ms": 
8.866838878020644, "std_itl_ms": 0.13255333227863098, "p95_itl_ms": 9.031242376659065, "p99_itl_ms": 9.132659959141165, "concurrency": 0.999599930357839, "accept_length": null, "max_output_tokens_per_s": 115.0, "max_concurrent_requests": 2} +{"tag": "flashmla", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 1, "sharegpt_output_len": null, "random_input_len": 8000, "random_output_len": 1000, "random_range_ratio": 0.0, "server_info": {"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30002, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 64, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": null, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 379113072, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, 
"export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "flashmla", "decode_attention_backend": null, "prefill_attention_backend": null, "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, 
"kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": false, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, 
"num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "status": "ready", "max_total_num_tokens": 1386176, "max_req_input_len": 202746, "internal_states": [{"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30002, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 64, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": 3500, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 379113072, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": 
false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "flashmla", "decode_attention_backend": "flashmla", "prefill_attention_backend": "flashmla", "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 
256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": false, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, 
"dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "use_mla_backend": true, "last_gen_throughput": 94.37640848866731, "memory_usage": {"weight": 56.33, "kvcache": 69.9, "token_capacity": 1386176, "graph": 1.21}, "effective_max_running_requests_per_dp": 3500}], "version": "0.0.0.dev1+g1e8e0cca2"}, "duration": 44.99840556993149, "completed": 10, "total_input_tokens": 41941, "total_input_text_tokens": 41941, "total_input_vision_tokens": 0, "total_output_tokens": 4220, "total_output_tokens_retokenized": 4220, "request_throughput": 0.2222300962299457, "input_throughput": 932.0552465980153, "output_throughput": 93.78110060903708, "total_throughput": 1025.8363472070523, "mean_e2e_latency_ms": 4497.922634705901, "median_e2e_latency_ms": 3725.210659438744, "std_e2e_latency_ms": 2773.6783957420403, "p90_e2e_latency_ms": 8437.325873808004, "p99_e2e_latency_ms": 9939.448532455135, "mean_ttft_ms": 188.3448517182842, "median_ttft_ms": 149.44852597545832, "std_ttft_ms": 112.42184826547498, "p99_ttft_ms": 448.1769835739397, "mean_tpot_ms": 10.145170876880718, "median_tpot_ms": 10.421453350218435, "std_tpot_ms": 0.9104786801424741, "p99_tpot_ms": 11.68051373533182, "mean_itl_ms": 10.236510755759697, "median_itl_ms": 10.264639975503087, "std_itl_ms": 0.8240081868908503, "p95_itl_ms": 11.802727438043803, "p99_itl_ms": 11.90974765922874, "concurrency": 0.9995737799455433, "accept_length": null, "max_output_tokens_per_s": 113.0, "max_concurrent_requests": 2} +{"tag": "flashinfer", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 1, "sharegpt_output_len": null, "random_input_len": 8000, "random_output_len": 1000, "random_range_ratio": 0.0, "server_info": {"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": 
"127.0.0.1", "port": 30003, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 1, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": null, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 253908945, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "flashinfer", 
"decode_attention_backend": null, "prefill_attention_backend": null, "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, 
"enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": false, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, 
"decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "status": "ready", "max_total_num_tokens": 1386222, "max_req_input_len": 202746, "internal_states": [{"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30003, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 1, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": 3500, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 253908945, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, 
"reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "flashinfer", "decode_attention_backend": "flashinfer", "prefill_attention_backend": "flashinfer", "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, 
"cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": false, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, 
"remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "use_mla_backend": true, "last_gen_throughput": 117.92113979515487, "memory_usage": {"weight": 56.33, "kvcache": 69.9, "token_capacity": 1386222, "graph": 1.71}, "effective_max_running_requests_per_dp": 3500}], "version": "0.0.0.dev1+g1e8e0cca2"}, "duration": 37.378284689970315, "completed": 10, "total_input_tokens": 41941, "total_input_text_tokens": 41941, "total_input_vision_tokens": 0, "total_output_tokens": 4220, "total_output_tokens_retokenized": 4026, "request_throughput": 0.2675350161984103, "input_throughput": 1122.0686114377527, "output_throughput": 112.89977683572914, "total_throughput": 1234.9683882734819, "mean_e2e_latency_ms": 3736.1401027301326, "median_e2e_latency_ms": 3029.271812643856, "std_e2e_latency_ms": 2279.0599616437844, "p90_e2e_latency_ms": 6922.7269294671705, "p99_e2e_latency_ms": 8203.730608876795, "mean_ttft_ms": 169.47351093403995, "median_ttft_ms": 149.5549235260114, "std_ttft_ms": 73.39107755634313, "p99_ttft_ms": 270.47318189404905, "mean_tpot_ms": 8.438028617355302, "median_tpot_ms": 8.468548115720909, "std_tpot_ms": 0.17177658018927264, "p99_tpot_ms": 8.68933005085632, "mean_itl_ms": 8.471877963532142, "median_itl_ms": 8.466457016766071, "std_itl_ms": 0.2015365133171823, "p95_itl_ms": 8.739279105793685, "p99_itl_ms": 8.892325791530311, "concurrency": 0.9995483029034363, "accept_length": null, "max_output_tokens_per_s": 122.0, "max_concurrent_requests": 2} +{"tag": "lightllm-fa3", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 1, "sharegpt_output_len": null, "random_input_len": 8000, "random_output_len": 1000, "random_range_ratio": 0.0, "server_info": null, "duration": 29.579997348831967, "completed": 10, "total_input_tokens": 41941, "total_input_text_tokens": 41941, "total_input_vision_tokens": 0, "total_output_tokens": 4220, "total_output_tokens_retokenized": 4220, "request_throughput": 0.3380662912870367, "input_throughput": 1417.8838322869606, "output_throughput": 142.66397492312947, "total_throughput": 1560.54780721009, "mean_e2e_latency_ms": 2957.1972485166043, "median_e2e_latency_ms": 2444.4284136407077, "std_e2e_latency_ms": 1800.5337827383514, "p90_e2e_latency_ms": 5347.466037608682, "p99_e2e_latency_ms": 6513.965703966096, "mean_ttft_ms": 151.79783792700619, "median_ttft_ms": 153.29680847935379, "std_ttft_ms": 74.49831125817643, "p99_ttft_ms": 276.19091010885313, "mean_tpot_ms": 6.6520214002182465, "median_tpot_ms": 6.663941853427483, "std_tpot_ms": 0.06995932396737312, "p99_tpot_ms": 6.761680932473347, "mean_itl_ms": 6.665223839852749, "median_itl_ms": 6.226972909644246, "std_itl_ms": 0.9274401703373519, "p95_itl_ms": 8.330000611022115, "p99_itl_ms": 8.470633188262582, "concurrency": 0.9997287064102377, "accept_length": null, "max_output_tokens_per_s": 153.0, "max_concurrent_requests": 3} +{"tag": 
"lightllm-flashinfer", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 1, "sharegpt_output_len": null, "random_input_len": 8000, "random_output_len": 1000, "random_range_ratio": 0.0, "server_info": null, "duration": 29.990934344939888, "completed": 10, "total_input_tokens": 41941, "total_input_text_tokens": 41941, "total_input_vision_tokens": 0, "total_output_tokens": 4220, "total_output_tokens_retokenized": 4220, "request_throughput": 0.33343409328249934, "input_throughput": 1398.4559306361305, "output_throughput": 140.70918736521472, "total_throughput": 1539.1651180013453, "mean_e2e_latency_ms": 2998.289765859954, "median_e2e_latency_ms": 2464.879463543184, "std_e2e_latency_ms": 1852.130967789295, "p90_e2e_latency_ms": 5536.782492487691, "p99_e2e_latency_ms": 6639.743499069009, "mean_ttft_ms": 139.17544910218567, "median_ttft_ms": 142.81450887210667, "std_ttft_ms": 65.24136773664321, "p99_ttft_ms": 242.65094643458724, "mean_tpot_ms": 6.75879133261816, "median_tpot_ms": 6.808080749864214, "std_tpot_ms": 0.18544835486888658, "p99_tpot_ms": 7.013304534186441, "mean_itl_ms": 6.792839756158036, "median_itl_ms": 6.243136944249272, "std_itl_ms": 0.9610731556465486, "p95_itl_ms": 8.374860789626837, "p99_itl_ms": 8.49069862626493, "concurrency": 0.999732029477711, "accept_length": null, "max_output_tokens_per_s": 154.0, "max_concurrent_requests": 3} +{"tag": "lightllm-triton", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 1, "sharegpt_output_len": null, "random_input_len": 8000, "random_output_len": 1000, "random_range_ratio": 0.0, "server_info": null, "duration": 44.55748701398261, "completed": 10, "total_input_tokens": 41941, "total_input_text_tokens": 41941, "total_input_vision_tokens": 0, "total_output_tokens": 4220, "total_output_tokens_retokenized": 4220, "request_throughput": 0.22442917386390968, "input_throughput": 941.2783981026236, "output_throughput": 94.70911137056987, "total_throughput": 1035.9875094731933, "mean_e2e_latency_ms": 4454.932378418744, "median_e2e_latency_ms": 3474.5054844534025, "std_e2e_latency_ms": 3141.729941933886, "p90_e2e_latency_ms": 9705.666761891916, "p99_e2e_latency_ms": 10252.758897030726, "mean_ttft_ms": 167.52417001407593, "median_ttft_ms": 153.05893344338983, "std_ttft_ms": 96.79885770899398, "p99_ttft_ms": 366.1548413033597, "mean_tpot_ms": 9.747877234945388, "median_tpot_ms": 10.067643957439982, "std_tpot_ms": 2.2756355104449306, "p99_tpot_ms": 13.484334798556336, "mean_itl_ms": 10.186269659067515, "median_itl_ms": 10.326585033908486, "std_itl_ms": 2.367790916680719, "p95_itl_ms": 14.481087820604444, "p99_itl_ms": 14.626262942329049, "concurrency": 0.9998167933081008, "accept_length": null, "max_output_tokens_per_s": 148.0, "max_concurrent_requests": 2} diff --git a/last_bench/sglang-oai_0123_320_1000_1000.jsonl b/last_bench/sglang-oai_0123_320_1000_1000.jsonl new file mode 100644 index 000000000..1881fdfce --- /dev/null +++ b/last_bench/sglang-oai_0123_320_1000_1000.jsonl @@ -0,0 +1,7 @@ +{"tag": "triton", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 64, "sharegpt_output_len": null, "random_input_len": 1000, "random_output_len": 1000, "random_range_ratio": 0.0, "server_info": null, "duration": 88.22740700491704, "completed": 320, "total_input_tokens": 158939, "total_input_text_tokens": 158939, "total_input_vision_tokens": 0, "total_output_tokens": 170134, "total_output_tokens_retokenized": 169083, 
"request_throughput": 3.626990873506754, "input_throughput": 1801.4696951384065, "output_throughput": 1928.3577039787442, "total_throughput": 3729.8273991171504, "mean_e2e_latency_ms": 15789.648024459893, "median_e2e_latency_ms": 16212.67285104841, "std_e2e_latency_ms": 8072.337017063825, "p90_e2e_latency_ms": 26709.808869380508, "p99_e2e_latency_ms": 31045.851741307415, "mean_ttft_ms": 144.5240720640868, "median_ttft_ms": 109.16357941459864, "std_ttft_ms": 61.89498059484625, "p99_ttft_ms": 260.22138732951134, "mean_tpot_ms": 29.608090827655733, "median_tpot_ms": 30.36084325541004, "std_tpot_ms": 2.7705613573890226, "p99_tpot_ms": 33.954402691723445, "mean_itl_ms": 29.485199626481318, "median_itl_ms": 24.739603977650404, "std_itl_ms": 17.225318489747373, "p95_itl_ms": 73.9279053057544, "p99_itl_ms": 98.22126995772122, "concurrency": 57.268909280599985, "accept_length": null, "max_output_tokens_per_s": 2495.0, "max_concurrent_requests": 71} +{"tag": "fa3", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 64, "sharegpt_output_len": null, "random_input_len": 1000, "random_output_len": 1000, "random_range_ratio": 0.0, "server_info": null, "duration": 81.37246890296228, "completed": 320, "total_input_tokens": 158939, "total_input_text_tokens": 158939, "total_input_vision_tokens": 0, "total_output_tokens": 170134, "total_output_tokens_retokenized": 169225, "request_throughput": 3.9325339923212126, "input_throughput": 1953.2281881423164, "output_throughput": 2090.805432029929, "total_throughput": 4044.0336201722453, "mean_e2e_latency_ms": 14576.79473591852, "median_e2e_latency_ms": 14943.263828055933, "std_e2e_latency_ms": 7403.160837428686, "p90_e2e_latency_ms": 24588.386960700154, "p99_e2e_latency_ms": 28584.442388296593, "mean_ttft_ms": 189.66343759384472, "median_ttft_ms": 111.13140103407204, "std_ttft_ms": 154.90940140833771, "p99_ttft_ms": 498.54426054283977, "mean_tpot_ms": 27.2643124012368, "median_tpot_ms": 27.90378232804311, "std_tpot_ms": 2.606168417806973, "p99_tpot_ms": 31.501365364149077, "mean_itl_ms": 27.111262237968155, "median_itl_ms": 22.098568035289645, "std_itl_ms": 17.53854613363614, "p95_itl_ms": 73.02747792564332, "p99_itl_ms": 94.83545971103013, "concurrency": 57.323740798088494, "accept_length": null, "max_output_tokens_per_s": 2815.0, "max_concurrent_requests": 72} +{"tag": "flashmla", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 64, "sharegpt_output_len": null, "random_input_len": 1000, "random_output_len": 1000, "random_range_ratio": 0.0, "server_info": null, "duration": 85.05533684790134, "completed": 320, "total_input_tokens": 158939, "total_input_text_tokens": 158939, "total_input_vision_tokens": 0, "total_output_tokens": 170134, "total_output_tokens_retokenized": 169005, "request_throughput": 3.7622565715333556, "input_throughput": 1868.6540538216875, "output_throughput": 2000.2742485664248, "total_throughput": 3868.928302388112, "mean_e2e_latency_ms": 15298.422675980692, "median_e2e_latency_ms": 15641.821417491883, "std_e2e_latency_ms": 7749.526213849034, "p90_e2e_latency_ms": 25955.183803243566, "p99_e2e_latency_ms": 30152.963120699864, "mean_ttft_ms": 269.53684461695957, "median_ttft_ms": 118.97248448804021, "std_ttft_ms": 275.2191731553012, "p99_ttft_ms": 867.960302028805, "mean_tpot_ms": 28.51670166651952, "median_tpot_ms": 29.272230424085432, "std_tpot_ms": 3.0552503112772613, "p99_tpot_ms": 33.589946233320624, "mean_itl_ms": 28.321577751000884, "median_itl_ms": 
21.607821458019316, "std_itl_ms": 22.36865151668966, "p95_itl_ms": 95.35049657570198, "p99_itl_ms": 99.12627678597336, "concurrency": 57.55659124680326, "accept_length": null, "max_output_tokens_per_s": 2815.0, "max_concurrent_requests": 71} +{"tag": "flashinfer", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 64, "sharegpt_output_len": null, "random_input_len": 1000, "random_output_len": 1000, "random_range_ratio": 0.0, "server_info": null, "duration": 76.08857471891679, "completed": 320, "total_input_tokens": 158939, "total_input_text_tokens": 158939, "total_input_vision_tokens": 0, "total_output_tokens": 170134, "total_output_tokens_retokenized": 154408, "request_throughput": 4.205624841602442, "input_throughput": 2088.868145935783, "output_throughput": 2235.9993025037184, "total_throughput": 4324.8674484395015, "mean_e2e_latency_ms": 13736.860047160008, "median_e2e_latency_ms": 14124.713889556006, "std_e2e_latency_ms": 7050.329882103861, "p90_e2e_latency_ms": 23447.995013161566, "p99_e2e_latency_ms": 27480.407421826385, "mean_ttft_ms": 162.28689183481038, "median_ttft_ms": 113.47088008187711, "std_ttft_ms": 78.54445987281336, "p99_ttft_ms": 314.5761683699675, "mean_tpot_ms": 25.760918661083565, "median_tpot_ms": 26.645654844490704, "std_tpot_ms": 3.0593642182725933, "p99_tpot_ms": 30.497802868581097, "mean_itl_ms": 25.969708663598038, "median_itl_ms": 19.10600601695478, "std_itl_ms": 49.776817453535145, "p95_itl_ms": 91.81465758010745, "p99_itl_ms": 96.63003749214113, "concurrency": 57.77207985995222, "accept_length": null, "max_output_tokens_per_s": 3077.0, "max_concurrent_requests": 71} +{"tag": "lightllm-fa3", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 64, "sharegpt_output_len": null, "random_input_len": 1000, "random_output_len": 1000, "random_range_ratio": 0.0, "server_info": null, "duration": 63.755531140137464, "completed": 320, "total_input_tokens": 158939, "total_input_text_tokens": 158939, "total_input_vision_tokens": 0, "total_output_tokens": 170134, "total_output_tokens_retokenized": 169953, "request_throughput": 5.019172364772962, "input_throughput": 2492.944489014531, "output_throughput": 2668.537097213385, "total_throughput": 5161.481586227916, "mean_e2e_latency_ms": 11383.221806850634, "median_e2e_latency_ms": 11569.44180233404, "std_e2e_latency_ms": 5830.871063837923, "p90_e2e_latency_ms": 18649.839925765995, "p99_e2e_latency_ms": 23242.17696905136, "mean_ttft_ms": 158.7225136347115, "median_ttft_ms": 99.41273042932153, "std_ttft_ms": 128.79054902232136, "p99_ttft_ms": 430.53666956257075, "mean_tpot_ms": 21.295283829261166, "median_tpot_ms": 20.846842278524452, "std_tpot_ms": 2.433949297342979, "p99_tpot_ms": 26.92992232441266, "mean_itl_ms": 21.162570676283387, "median_itl_ms": 19.59298853762448, "std_itl_ms": 11.532776164590596, "p95_itl_ms": 21.707738342229277, "p99_itl_ms": 80.35093266516924, "concurrency": 57.13435231502565, "accept_length": null, "max_output_tokens_per_s": 3383.0, "max_concurrent_requests": 73} +{"tag": "lightllm-flashinfer", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 64, "sharegpt_output_len": null, "random_input_len": 1000, "random_output_len": 1000, "random_range_ratio": 0.0, "server_info": null, "duration": 61.45518663688563, "completed": 320, "total_input_tokens": 158939, "total_input_text_tokens": 158939, "total_input_vision_tokens": 0, "total_output_tokens": 170134, 
"total_output_tokens_retokenized": 170000, "request_throughput": 5.207046264308875, "input_throughput": 2586.2585193843383, "output_throughput": 2768.4237785372693, "total_throughput": 5354.682297921608, "mean_e2e_latency_ms": 10922.24139619575, "median_e2e_latency_ms": 11354.776699910872, "std_e2e_latency_ms": 5564.939199084309, "p90_e2e_latency_ms": 18001.203981018625, "p99_e2e_latency_ms": 21854.658021384384, "mean_ttft_ms": 137.85723637993215, "median_ttft_ms": 92.63190545607358, "std_ttft_ms": 111.70441609815067, "p99_ttft_ms": 531.437413871754, "mean_tpot_ms": 20.47186250275375, "median_tpot_ms": 19.981252363955022, "std_tpot_ms": 2.0238247748229328, "p99_tpot_ms": 26.164035337201277, "mean_itl_ms": 20.334099341341997, "median_itl_ms": 19.39774421043694, "std_itl_ms": 12.074799257794227, "p95_itl_ms": 21.160434209741645, "p99_itl_ms": 94.60746685508639, "concurrency": 56.87261625994083, "accept_length": null, "max_output_tokens_per_s": 3439.0, "max_concurrent_requests": 76} +{"tag": "lightllm-triton", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 64, "sharegpt_output_len": null, "random_input_len": 1000, "random_output_len": 1000, "random_range_ratio": 0.0, "server_info": null, "duration": 72.93456637696363, "completed": 320, "total_input_tokens": 158939, "total_input_text_tokens": 158939, "total_input_vision_tokens": 0, "total_output_tokens": 170134, "total_output_tokens_retokenized": 169880, "request_throughput": 4.387494378811745, "input_throughput": 2179.1999033561247, "output_throughput": 2332.693652014867, "total_throughput": 4511.893555370992, "mean_e2e_latency_ms": 12913.627206679666, "median_e2e_latency_ms": 13505.511358496733, "std_e2e_latency_ms": 6557.100403482816, "p90_e2e_latency_ms": 21201.299887429923, "p99_e2e_latency_ms": 25048.928881627508, "mean_ttft_ms": 123.85591540514724, "median_ttft_ms": 92.86466846242547, "std_ttft_ms": 61.161896228451845, "p99_ttft_ms": 276.2570247449912, "mean_tpot_ms": 24.217108368398687, "median_tpot_ms": 23.87942465522868, "std_tpot_ms": 1.7713138005549998, "p99_tpot_ms": 28.78332211641795, "mean_itl_ms": 24.109886530009778, "median_itl_ms": 23.613386088982224, "std_itl_ms": 10.00878708819206, "p95_itl_ms": 24.657311802729964, "p99_itl_ms": 80.39279339835048, "concurrency": 56.65846677937745, "accept_length": null, "max_output_tokens_per_s": 2932.0, "max_concurrent_requests": 74} diff --git a/last_bench/sglang-oai_0123_320_1000_8000.jsonl b/last_bench/sglang-oai_0123_320_1000_8000.jsonl new file mode 100644 index 000000000..15f9e8e1c --- /dev/null +++ b/last_bench/sglang-oai_0123_320_1000_8000.jsonl @@ -0,0 +1,7 @@ +{"tag": "triton", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 64, "sharegpt_output_len": null, "random_input_len": 1000, "random_output_len": 8000, "random_range_ratio": 0.0, "server_info": {"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30000, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, 
"kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 1, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": null, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 604505033, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "triton", "decode_attention_backend": null, "prefill_attention_backend": null, "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": 
"flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, 
"enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": false, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "status": "ready", "max_total_num_tokens": 1386224, "max_req_input_len": 202746, "internal_states": [{"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", 
"tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30000, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 1, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": 3500, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 604505033, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": 
null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "triton", "decode_attention_backend": "triton", "prefill_attention_backend": "triton", "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 
128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": true, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, 
"pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "use_mla_backend": true, "last_gen_throughput": 60.77974941333156, "memory_usage": {"weight": 56.33, "kvcache": 69.9, "token_capacity": 1386224, "graph": 1.69}, "effective_max_running_requests_per_dp": 3500}], "version": "0.0.0.dev1+g1e8e0cca2"}, "duration": 795.775275953114, "completed": 320, "total_input_tokens": 158939, "total_input_text_tokens": 158939, "total_input_vision_tokens": 0, "total_output_tokens": 1301025, "total_output_tokens_retokenized": 1290119, "request_throughput": 0.40212357642894897, "input_throughput": 199.72849723137725, "output_throughput": 1634.9150813233541, "total_throughput": 1834.6435785547314, "mean_e2e_latency_ms": 139349.8814202183, "median_e2e_latency_ms": 143803.1537218485, "std_e2e_latency_ms": 78727.69232926994, "p90_e2e_latency_ms": 247236.3009268418, "p99_e2e_latency_ms": 278858.07280272944, "mean_ttft_ms": 331.6782287663955, "median_ttft_ms": 114.08616451080889, "std_ttft_ms": 449.4168910393588, "p99_ttft_ms": 1511.0559103870764, "mean_tpot_ms": 34.17474498115479, "median_tpot_ms": 35.2410475091058, "std_tpot_ms": 2.6142427195879208, "p99_tpot_ms": 37.41304436548783, "mean_itl_ms": 34.197111507755174, "median_itl_ms": 34.45811802521348, "std_itl_ms": 7.2511462249701575, "p95_itl_ms": 37.52578243147582, "p99_itl_ms": 75.70609046146272, "concurrency": 56.03587269164813, "accept_length": null, "max_output_tokens_per_s": 2752.0, "max_concurrent_requests": 68} +{"tag": "fa3", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 64, "sharegpt_output_len": null, "random_input_len": 1000, "random_output_len": 8000, "random_range_ratio": 0.0, "server_info": {"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30001, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 1, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, 
"prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": null, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 398782077, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "fa3", "decode_attention_backend": null, "prefill_attention_backend": null, "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", 
"speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, 
"triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": false, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "status": "ready", "max_total_num_tokens": 1386225, "max_req_input_len": 202746, "internal_states": [{"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30001, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": 
null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 1, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": 3500, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 398782077, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "fa3", "decode_attention_backend": "fa3", "prefill_attention_backend": "fa3", "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, 
"speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, 
"enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": false, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "use_mla_backend": true, "last_gen_throughput": 111.68147398118492, "memory_usage": {"weight": 56.33, "kvcache": 69.9, "token_capacity": 1386225, "graph": 2.36}, "effective_max_running_requests_per_dp": 3500}], "version": "0.0.0.dev1+g1e8e0cca2"}, "duration": 563.1268216208555, "completed": 320, "total_input_tokens": 158939, "total_input_text_tokens": 158939, "total_input_vision_tokens": 0, "total_output_tokens": 1301025, "total_output_tokens_retokenized": 1289330, "request_throughput": 0.5682556534582027, 
"input_throughput": 282.24370407810403, "output_throughput": 2310.358786063932, "total_throughput": 2592.602490142036, "mean_e2e_latency_ms": 99475.42591601159, "median_e2e_latency_ms": 102824.72171052359, "std_e2e_latency_ms": 55369.15182006764, "p90_e2e_latency_ms": 176665.20955280866, "p99_e2e_latency_ms": 194569.74888183875, "mean_ttft_ms": 284.5753706082178, "median_ttft_ms": 99.63217296171933, "std_ttft_ms": 370.4153459018554, "p99_ttft_ms": 1195.5236710840836, "mean_tpot_ms": 24.49880321564915, "median_tpot_ms": 24.85869417485708, "std_tpot_ms": 1.065935851021897, "p99_tpot_ms": 25.785650772846513, "mean_itl_ms": 24.39991724061233, "median_itl_ms": 24.06020206399262, "std_itl_ms": 6.6672981127611495, "p95_itl_ms": 24.959364032838494, "p99_itl_ms": 73.22543800109997, "concurrency": 56.5274731569362, "accept_length": null, "max_output_tokens_per_s": 2944.0, "max_concurrent_requests": 68} +{"tag": "flashmla", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 64, "sharegpt_output_len": null, "random_input_len": 1000, "random_output_len": 8000, "random_range_ratio": 0.0, "server_info": {"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30002, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 64, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": null, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 379113072, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, 
"show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "flashmla", "decode_attention_backend": null, "prefill_attention_backend": null, "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", 
"mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": false, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, 
"enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "status": "ready", "max_total_num_tokens": 1386176, "max_req_input_len": 202746, "internal_states": [{"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30002, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 64, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": 3500, "pp_async_batch_depth": 0, 
"stream_interval": 1, "stream_output": false, "random_seed": 379113072, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "flashmla", "decode_attention_backend": "flashmla", "prefill_attention_backend": "flashmla", "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, 
"deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, 
"enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": false, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "use_mla_backend": true, "last_gen_throughput": 98.18409159571067, "memory_usage": {"weight": 56.33, "kvcache": 69.9, "token_capacity": 1386176, "graph": 1.21}, "effective_max_running_requests_per_dp": 3500}], "version": "0.0.0.dev1+g1e8e0cca2"}, "duration": 549.4739509860519, "completed": 320, "total_input_tokens": 158939, "total_input_text_tokens": 158939, "total_input_vision_tokens": 0, "total_output_tokens": 1301025, "total_output_tokens_retokenized": 1288321, "request_throughput": 0.582375196177631, "input_throughput": 289.256660328989, "output_throughput": 2367.7646550218824, "total_throughput": 2657.0213153508716, "mean_e2e_latency_ms": 96338.26554486295, "median_e2e_latency_ms": 99560.23909756914, "std_e2e_latency_ms": 53588.08448171708, "p90_e2e_latency_ms": 171176.718534017, "p99_e2e_latency_ms": 188599.8856568616, "mean_ttft_ms": 347.4133831434301, "median_ttft_ms": 120.82068901509047, "std_ttft_ms": 461.0518339640887, "p99_ttft_ms": 1505.2862889668907, "mean_tpot_ms": 23.73714903655295, "median_tpot_ms": 24.09194966855164, "std_tpot_ms": 1.1065674013938183, "p99_tpot_ms": 25.431691372342634, "mean_itl_ms": 23.620946834591376, "median_itl_ms": 23.033733014017344, "std_itl_ms": 8.992960795726017, "p95_itl_ms": 23.83332545869052, "p99_itl_ms": 96.43437403254211, "concurrency": 
56.10501629610227, "accept_length": null, "max_output_tokens_per_s": 3008.0, "max_concurrent_requests": 68} +{"tag": "flashinfer", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 64, "sharegpt_output_len": null, "random_input_len": 1000, "random_output_len": 8000, "random_range_ratio": 0.0, "server_info": {"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30003, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 1, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": null, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 253908945, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", 
"chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "flashinfer", "decode_attention_backend": null, "prefill_attention_backend": null, "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": 
null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": false, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": 
"zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "status": "ready", "max_total_num_tokens": 1386222, "max_req_input_len": 202746, "internal_states": [{"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30003, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 1, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": 3500, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 253908945, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, 
"bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "flashinfer", "decode_attention_backend": "flashinfer", "prefill_attention_backend": "flashinfer", "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": 
"kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": false, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, 
"debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "use_mla_backend": true, "last_gen_throughput": 114.74399660365336, "memory_usage": {"weight": 56.33, "kvcache": 69.9, "token_capacity": 1386222, "graph": 1.71}, "effective_max_running_requests_per_dp": 3500}], "version": "0.0.0.dev1+g1e8e0cca2"}, "duration": 497.495255718939, "completed": 320, "total_input_tokens": 158939, "total_input_text_tokens": 158939, "total_input_vision_tokens": 0, "total_output_tokens": 1301025, "total_output_tokens_retokenized": 1171209, "request_throughput": 0.6432222143255667, "input_throughput": 319.4784235084101, "output_throughput": 2615.150566852876, "total_throughput": 2934.6289903612865, "mean_e2e_latency_ms": 87646.07776943667, "median_e2e_latency_ms": 90406.89503098838, "std_e2e_latency_ms": 48906.1312695597, "p90_e2e_latency_ms": 156414.02036293878, "p99_e2e_latency_ms": 172093.0460711522, "mean_ttft_ms": 289.28253828853485, "median_ttft_ms": 115.20864104386419, "std_ttft_ms": 356.69460164728656, "p99_ttft_ms": 1172.9512733081356, "mean_tpot_ms": 21.568914790170023, "median_tpot_ms": 21.93780025128241, "std_tpot_ms": 1.0788863103757071, "p99_tpot_ms": 22.959191481056195, "mean_itl_ms": 21.979791418337847, "median_itl_ms": 20.824731094762683, "std_itl_ms": 139.14425778232058, "p95_itl_ms": 22.08241473417729, "p99_itl_ms": 91.44890741910787, "concurrency": 56.37590421980788, "accept_length": null, "max_output_tokens_per_s": 3336.0, "max_concurrent_requests": 68} +{"tag": "lightllm-fa3", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 64, "sharegpt_output_len": null, "random_input_len": 1000, "random_output_len": 8000, "random_range_ratio": 0.0, "server_info": null, "duration": 489.09222918911837, "completed": 320, "total_input_tokens": 158939, "total_input_text_tokens": 158939, "total_input_vision_tokens": 0, "total_output_tokens": 1301025, "total_output_tokens_retokenized": 1300503, "request_throughput": 0.654273326997115, "input_throughput": 324.9673384987327, "output_throughput": 2660.0811101763175, "total_throughput": 2985.0484486750506, "mean_e2e_latency_ms": 87208.0360039814, "median_e2e_latency_ms": 90027.0975221647, "std_e2e_latency_ms": 
48689.53844942357, "p90_e2e_latency_ms": 155637.0125748683, "p99_e2e_latency_ms": 171588.75004097354, "mean_ttft_ms": 170.48821544085513, "median_ttft_ms": 121.7766614863649, "std_ttft_ms": 105.2517967311711, "p99_ttft_ms": 400.0959781301208, "mean_tpot_ms": 21.505252597332344, "median_tpot_ms": 21.96794476552389, "std_tpot_ms": 1.1776802400447766, "p99_tpot_ms": 23.08266933652801, "mean_itl_ms": 21.418628915344538, "median_itl_ms": 21.57716895453632, "std_itl_ms": 6.864509437259774, "p95_itl_ms": 22.48724412638694, "p99_itl_ms": 76.17134404601528, "concurrency": 57.05789185720911, "accept_length": null, "max_output_tokens_per_s": 3390.0, "max_concurrent_requests": 68} +{"tag": "lightllm-flashinfer", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 64, "sharegpt_output_len": null, "random_input_len": 1000, "random_output_len": 8000, "random_range_ratio": 0.0, "server_info": null, "duration": 484.7376974809449, "completed": 320, "total_input_tokens": 158939, "total_input_text_tokens": 158939, "total_input_vision_tokens": 0, "total_output_tokens": 1301025, "total_output_tokens_retokenized": 1300473, "request_throughput": 0.6601508437717065, "input_throughput": 327.8866092444727, "output_throughput": 2683.9773484940138, "total_throughput": 3011.8639577384865, "mean_e2e_latency_ms": 86270.7500121207, "median_e2e_latency_ms": 89360.22441205569, "std_e2e_latency_ms": 48155.0270541477, "p90_e2e_latency_ms": 154581.4763627248, "p99_e2e_latency_ms": 169827.765711823, "mean_ttft_ms": 213.335813806043, "median_ttft_ms": 139.8405219661072, "std_ttft_ms": 155.78232331585778, "p99_ttft_ms": 557.104211600963, "mean_tpot_ms": 21.26728059280753, "median_tpot_ms": 21.71698714107755, "std_tpot_ms": 1.217376459708817, "p99_tpot_ms": 23.195757839312797, "mean_itl_ms": 21.17951751529825, "median_itl_ms": 19.90097260568291, "std_itl_ms": 8.868560849765684, "p95_itl_ms": 22.173493867740035, "p99_itl_ms": 94.8113461374305, "concurrency": 56.95170841331944, "accept_length": null, "max_output_tokens_per_s": 3455.0, "max_concurrent_requests": 68} +{"tag": "lightllm-triton", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 64, "sharegpt_output_len": null, "random_input_len": 1000, "random_output_len": 8000, "random_range_ratio": 0.0, "server_info": null, "duration": 760.4980736661237, "completed": 320, "total_input_tokens": 158939, "total_input_text_tokens": 158939, "total_input_vision_tokens": 0, "total_output_tokens": 1301025, "total_output_tokens_retokenized": 1300506, "request_throughput": 0.42077687121202023, "input_throughput": 208.99329729239776, "output_throughput": 1710.7538402144332, "total_throughput": 1919.747137506831, "mean_e2e_latency_ms": 133178.9838919889, "median_e2e_latency_ms": 137120.54603244178, "std_e2e_latency_ms": 75561.65705893908, "p90_e2e_latency_ms": 238436.7804493755, "p99_e2e_latency_ms": 266451.137662374, "mean_ttft_ms": 221.37831723230192, "median_ttft_ms": 122.04821163322777, "std_ttft_ms": 194.03500611102027, "p99_ttft_ms": 679.4678735616617, "mean_tpot_ms": 32.663685042973555, "median_tpot_ms": 33.72062742356595, "std_tpot_ms": 2.9253090523981875, "p99_tpot_ms": 36.29193072595595, "mean_itl_ms": 32.71866235950269, "median_itl_ms": 32.39020751789212, "std_itl_ms": 6.883452147345264, "p95_itl_ms": 36.488556885160506, "p99_itl_ms": 78.23992114746945, "concurrency": 56.038636153267134, "accept_length": null, "max_output_tokens_per_s": 2941.0, "max_concurrent_requests": 67} diff --git 
a/last_bench/sglang-oai_0123_320_8000_1000.jsonl b/last_bench/sglang-oai_0123_320_8000_1000.jsonl new file mode 100644 index 000000000..58c0b7aab --- /dev/null +++ b/last_bench/sglang-oai_0123_320_8000_1000.jsonl @@ -0,0 +1,7 @@ +{"tag": "triton", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 64, "sharegpt_output_len": null, "random_input_len": 8000, "random_output_len": 1000, "random_range_ratio": 0.0, "server_info": {"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30000, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 1, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": null, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 604505033, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, 
"api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "triton", "decode_attention_backend": null, "prefill_attention_backend": null, "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, 
"dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": false, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, 
"disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "status": "ready", "max_total_num_tokens": 1386224, "max_req_input_len": 202746, "internal_states": [{"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30000, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 1, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": 3500, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 604505033, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, 
"tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "triton", "decode_attention_backend": "triton", "prefill_attention_backend": "triton", "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, 
"enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": true, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, 
"dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "use_mla_backend": true, "last_gen_throughput": 179.99204346640389, "memory_usage": {"weight": 56.33, "kvcache": 69.9, "token_capacity": 1386224, "graph": 1.69}, "effective_max_running_requests_per_dp": 3500}], "version": "0.0.0.dev1+g1e8e0cca2"}, "duration": 213.6918999250047, "completed": 320, "total_input_tokens": 1273893, "total_input_text_tokens": 1273893, "total_input_vision_tokens": 0, "total_output_tokens": 170000, "total_output_tokens_retokenized": 169253, "request_throughput": 1.4974830590785342, "input_throughput": 5961.353708058535, "output_throughput": 795.5378751354713, "total_throughput": 6756.891583194006, "mean_e2e_latency_ms": 40317.63803735594, "median_e2e_latency_ms": 38796.292787534185, "std_e2e_latency_ms": 20574.065110792944, "p90_e2e_latency_ms": 68181.18443279527, "p99_e2e_latency_ms": 86222.15473645601, "mean_ttft_ms": 3873.7211492632923, "median_ttft_ms": 419.3346749525517, "std_ttft_ms": 7957.863948654817, "p99_ttft_ms": 30295.651319040917, "mean_tpot_ms": 73.12630469464084, "median_tpot_ms": 72.57030575516674, "std_tpot_ms": 30.291116863543795, "p99_tpot_ms": 200.25694315823512, "mean_itl_ms": 68.72528059291209, "median_itl_ms": 40.35702208057046, "std_itl_ms": 317.60260508162276, "p95_itl_ms": 215.18637766130269, "p99_itl_ms": 464.46666873060167, "concurrency": 60.37497994300084, "accept_length": null, "max_output_tokens_per_s": 1664.0, "max_concurrent_requests": 68} +{"tag": "fa3", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 64, "sharegpt_output_len": null, "random_input_len": 8000, "random_output_len": 1000, "random_range_ratio": 0.0, "server_info": {"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": 
"127.0.0.1", "port": 30001, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 1, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": null, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 398782077, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "fa3", 
"decode_attention_backend": null, "prefill_attention_backend": null, "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, 
"enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": false, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, 
"decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "status": "ready", "max_total_num_tokens": 1386225, "max_req_input_len": 202746, "internal_states": [{"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30001, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 1, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": 3500, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 398782077, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, 
"reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "fa3", "decode_attention_backend": "fa3", "prefill_attention_backend": "fa3", "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, 
"offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": false, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": 
null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "use_mla_backend": true, "last_gen_throughput": 260.871967377065, "memory_usage": {"weight": 56.33, "kvcache": 69.9, "token_capacity": 1386225, "graph": 2.36}, "effective_max_running_requests_per_dp": 3500}], "version": "0.0.0.dev1+g1e8e0cca2"}, "duration": 116.99378763791174, "completed": 320, "total_input_tokens": 1273893, "total_input_text_tokens": 1273893, "total_input_vision_tokens": 0, "total_output_tokens": 170000, "total_output_tokens_retokenized": 169199, "request_throughput": 2.735187965624119, "input_throughput": 10888.552509665018, "output_throughput": 1453.0686067378133, "total_throughput": 12341.621116402832, "mean_e2e_latency_ms": 21713.53677811203, "median_e2e_latency_ms": 20956.929823034443, "std_e2e_latency_ms": 10695.828486107339, "p90_e2e_latency_ms": 36470.67303787918, "p99_e2e_latency_ms": 42708.998922796454, "mean_ttft_ms": 1138.065914130857, "median_ttft_ms": 244.04411506839097, "std_ttft_ms": 2117.0216766259114, "p99_ttft_ms": 8315.621516695246, "mean_tpot_ms": 40.13203695065518, "median_tpot_ms": 40.90892274458055, "std_tpot_ms": 9.072889948810449, "p99_tpot_ms": 70.9889798734412, "mean_itl_ms": 38.8005202036452, "median_itl_ms": 25.11462802067399, "std_itl_ms": 92.39345109467033, "p95_itl_ms": 121.36952648870647, "p99_itl_ms": 196.49121300317347, "concurrency": 59.390604486628725, "accept_length": null, "max_output_tokens_per_s": 2419.0, "max_concurrent_requests": 69} +{"tag": "flashmla", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 64, "sharegpt_output_len": null, "random_input_len": 8000, "random_output_len": 1000, "random_range_ratio": 0.0, "server_info": {"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30002, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, 
"schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 64, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": null, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 379113072, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "flashmla", "decode_attention_backend": null, "prefill_attention_backend": null, "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", 
"speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 
320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": false, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "status": "ready", "max_total_num_tokens": 1386176, "max_req_input_len": 202746, "internal_states": [{"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30002, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", 
"enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 64, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": 3500, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 379113072, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "flashmla", "decode_attention_backend": "flashmla", "prefill_attention_backend": "flashmla", "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", 
"nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, 
"enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": false, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "use_mla_backend": true, "last_gen_throughput": 233.5999098891422, "memory_usage": {"weight": 56.33, "kvcache": 69.9, "token_capacity": 1386176, "graph": 1.21}, 
"effective_max_running_requests_per_dp": 3500}], "version": "0.0.0.dev1+g1e8e0cca2"}, "duration": 112.81122839893214, "completed": 320, "total_input_tokens": 1273893, "total_input_text_tokens": 1273893, "total_input_vision_tokens": 0, "total_output_tokens": 170000, "total_output_tokens_retokenized": 169153, "request_throughput": 2.8365970705361905, "input_throughput": 11292.253599926747, "output_throughput": 1506.942193722351, "total_throughput": 12799.195793649098, "mean_e2e_latency_ms": 20836.65436528754, "median_e2e_latency_ms": 20054.032715968788, "std_e2e_latency_ms": 10206.070387118045, "p90_e2e_latency_ms": 34869.302074844025, "p99_e2e_latency_ms": 41031.26083256677, "mean_ttft_ms": 1201.1233022982196, "median_ttft_ms": 222.6223434554413, "std_ttft_ms": 2285.9580876221967, "p99_ttft_ms": 8968.076999972109, "mean_tpot_ms": 38.38958789982542, "median_tpot_ms": 38.97802759513901, "std_tpot_ms": 9.349859411662857, "p99_tpot_ms": 72.9707564608667, "mean_itl_ms": 37.02905762269816, "median_itl_ms": 23.81886588409543, "std_itl_ms": 96.84834802975762, "p95_itl_ms": 109.55824161646888, "p99_itl_ms": 181.73167313216254, "concurrency": 59.105192732349764, "accept_length": null, "max_output_tokens_per_s": 2627.0, "max_concurrent_requests": 71} +{"tag": "flashinfer", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 64, "sharegpt_output_len": null, "random_input_len": 8000, "random_output_len": 1000, "random_range_ratio": 0.0, "server_info": {"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30003, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 1, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": null, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 253908945, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, 
"download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "flashinfer", "decode_attention_backend": null, "prefill_attention_backend": null, "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, 
"eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": false, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, 
"enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "status": "ready", "max_total_num_tokens": 1386222, "max_req_input_len": 202746, "internal_states": [{"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30003, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 1, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", 
"enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": 3500, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 253908945, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "flashinfer", "decode_attention_backend": "flashinfer", "prefill_attention_backend": "flashinfer", "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, 
"speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, 
"triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": false, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "use_mla_backend": true, "last_gen_throughput": 274.27057526732784, "memory_usage": {"weight": 56.33, "kvcache": 69.9, "token_capacity": 1386222, "graph": 1.71}, "effective_max_running_requests_per_dp": 3500}], "version": "0.0.0.dev1+g1e8e0cca2"}, "duration": 112.2128552980721, "completed": 320, "total_input_tokens": 1273893, "total_input_text_tokens": 1273893, "total_input_vision_tokens": 0, "total_output_tokens": 170000, "total_output_tokens_retokenized": 156092, "request_throughput": 2.8517231751208976, "input_throughput": 11352.469346013393, "output_throughput": 1514.977936782977, "total_throughput": 12867.44728279637, "mean_e2e_latency_ms": 20905.081391092244, "median_e2e_latency_ms": 20195.792312850244, "std_e2e_latency_ms": 10355.055639883163, "p90_e2e_latency_ms": 35295.89922316374, "p99_e2e_latency_ms": 41340.240367348306, "mean_ttft_ms": 1119.6136578902951, "median_ttft_ms": 241.6940564289689, "std_ttft_ms": 2078.1090631361863, "p99_ttft_ms": 8247.429187672678, 
"mean_tpot_ms": 38.66073936228574, "median_tpot_ms": 39.597893979087445, "std_tpot_ms": 9.283534476702123, "p99_tpot_ms": 70.24860901496342, "mean_itl_ms": 37.540270646483414, "median_itl_ms": 23.099064477719367, "std_itl_ms": 111.93775803445361, "p95_itl_ms": 118.23046104982495, "p99_itl_ms": 195.68518341518939, "concurrency": 59.61550508076637, "accept_length": null, "max_output_tokens_per_s": 2595.0, "max_concurrent_requests": 69} +{"tag": "lightllm-fa3", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 64, "sharegpt_output_len": null, "random_input_len": 8000, "random_output_len": 1000, "random_range_ratio": 0.0, "server_info": null, "duration": 85.61151024489664, "completed": 320, "total_input_tokens": 1273893, "total_input_text_tokens": 1273893, "total_input_vision_tokens": 0, "total_output_tokens": 170000, "total_output_tokens_retokenized": 169953, "request_throughput": 3.737815149909418, "input_throughput": 14879.926733636119, "output_throughput": 1985.7142983893784, "total_throughput": 16865.641032025498, "mean_e2e_latency_ms": 15726.555139116681, "median_e2e_latency_ms": 15938.618876039982, "std_e2e_latency_ms": 8319.184834233261, "p90_e2e_latency_ms": 26862.930231867365, "p99_e2e_latency_ms": 32213.170791727025, "mean_ttft_ms": 277.19335837537074, "median_ttft_ms": 209.07252049073577, "std_ttft_ms": 191.42876900594652, "p99_ttft_ms": 673.9466302050278, "mean_tpot_ms": 29.049475895770772, "median_tpot_ms": 31.37292958098005, "std_tpot_ms": 4.868888682964287, "p99_tpot_ms": 35.96265073154, "mean_itl_ms": 29.144233108824857, "median_itl_ms": 21.852319943718612, "std_itl_ms": 28.374227292396913, "p95_itl_ms": 81.86526254285126, "p99_itl_ms": 147.18675724463537, "concurrency": 58.782956054876145, "accept_length": null, "max_output_tokens_per_s": 2941.0, "max_concurrent_requests": 71} +{"tag": "lightllm-flashinfer", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 64, "sharegpt_output_len": null, "random_input_len": 8000, "random_output_len": 1000, "random_range_ratio": 0.0, "server_info": null, "duration": 86.55054738209583, "completed": 320, "total_input_tokens": 1273893, "total_input_text_tokens": 1273893, "total_input_vision_tokens": 0, "total_output_tokens": 170000, "total_output_tokens_retokenized": 169856, "request_throughput": 3.697261423284729, "input_throughput": 14718.485769663917, "output_throughput": 1964.1701311200122, "total_throughput": 16682.655900783928, "mean_e2e_latency_ms": 15904.437752439117, "median_e2e_latency_ms": 16162.462773616426, "std_e2e_latency_ms": 8427.974845182198, "p90_e2e_latency_ms": 27384.40113137476, "p99_e2e_latency_ms": 32415.15740617411, "mean_ttft_ms": 273.31787959701614, "median_ttft_ms": 202.60415493976325, "std_ttft_ms": 191.3256412390505, "p99_ttft_ms": 717.0455491333269, "mean_tpot_ms": 29.321787822406232, "median_tpot_ms": 31.71495196018244, "std_tpot_ms": 5.006722794831577, "p99_tpot_ms": 35.875299398168835, "mean_itl_ms": 29.487281758663784, "median_itl_ms": 21.87118213623762, "std_itl_ms": 26.885584181443708, "p95_itl_ms": 99.2627366213128, "p99_itl_ms": 134.71757725346833, "concurrency": 58.80286416112642, "accept_length": null, "max_output_tokens_per_s": 3059.0, "max_concurrent_requests": 71} +{"tag": "lightllm-triton", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 64, "sharegpt_output_len": null, "random_input_len": 8000, "random_output_len": 1000, "random_range_ratio": 0.0, "server_info": 
null, "duration": 135.46758720418438, "completed": 320, "total_input_tokens": 1273893, "total_input_text_tokens": 1273893, "total_input_vision_tokens": 0, "total_output_tokens": 170000, "total_output_tokens_retokenized": 169774, "request_throughput": 2.3621886726134567, "input_throughput": 9403.67379600492, "output_throughput": 1254.9127323258988, "total_throughput": 10658.586528330818, "mean_e2e_latency_ms": 24741.570841200155, "median_e2e_latency_ms": 25289.254195871763, "std_e2e_latency_ms": 13030.010406347368, "p90_e2e_latency_ms": 42505.82551364787, "p99_e2e_latency_ms": 50194.15757432813, "mean_ttft_ms": 313.1668246234767, "median_ttft_ms": 267.09420257247984, "std_ttft_ms": 190.20965705094397, "p99_ttft_ms": 717.6051298505627, "mean_tpot_ms": 46.051251688612794, "median_tpot_ms": 48.8459376461558, "std_tpot_ms": 6.861405373140807, "p99_tpot_ms": 54.66476245289881, "mean_itl_ms": 46.08479912195559, "median_itl_ms": 37.94613853096962, "std_itl_ms": 35.57256149381119, "p95_itl_ms": 113.83506678976119, "p99_itl_ms": 199.79378876043484, "concurrency": 58.4442583837464, "accept_length": null, "max_output_tokens_per_s": 1783.0, "max_concurrent_requests": 70} diff --git a/last_bench/sglang-oai_0123_80_1000_1000.jsonl b/last_bench/sglang-oai_0123_80_1000_1000.jsonl new file mode 100644 index 000000000..1063a8aa5 --- /dev/null +++ b/last_bench/sglang-oai_0123_80_1000_1000.jsonl @@ -0,0 +1,7 @@ +{"tag": "triton", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 16, "sharegpt_output_len": null, "random_input_len": 1000, "random_output_len": 1000, "random_range_ratio": 0.0, "server_info": null, "duration": 58.75812066299841, "completed": 80, "total_input_tokens": 39668, "total_input_text_tokens": 39668, "total_input_vision_tokens": 0, "total_output_tokens": 40805, "total_output_tokens_retokenized": 40774, "request_throughput": 1.3615139336881172, "input_throughput": 675.1066840192528, "output_throughput": 694.4572008017952, "total_throughput": 1369.563884821048, "mean_e2e_latency_ms": 10290.420460538007, "median_e2e_latency_ms": 10854.726839112118, "std_e2e_latency_ms": 5481.416227557806, "p90_e2e_latency_ms": 17087.210976914506, "p99_e2e_latency_ms": 20273.29670665319, "mean_ttft_ms": 392.1325499599334, "median_ttft_ms": 109.52406388241798, "std_ttft_ms": 567.0151769289985, "p99_ttft_ms": 1561.811125462409, "mean_tpot_ms": 19.978452692457562, "median_tpot_ms": 19.987008761459922, "std_tpot_ms": 3.9156782596281814, "p99_tpot_ms": 28.60846174028575, "mean_itl_ms": 19.448439157011858, "median_itl_ms": 18.109396449290216, "std_itl_ms": 11.359371226695139, "p95_itl_ms": 19.791446393355727, "p99_itl_ms": 90.75973493745548, "concurrency": 14.010550840531788, "accept_length": null, "max_output_tokens_per_s": 896.0, "max_concurrent_requests": 21} +{"tag": "fa3", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 16, "sharegpt_output_len": null, "random_input_len": 1000, "random_output_len": 1000, "random_range_ratio": 0.0, "server_info": null, "duration": 53.40713605610654, "completed": 80, "total_input_tokens": 39668, "total_input_text_tokens": 39668, "total_input_vision_tokens": 0, "total_output_tokens": 40805, "total_output_tokens_retokenized": 40773, "request_throughput": 1.4979271668107514, "input_throughput": 742.7471856631112, "output_throughput": 764.036475521409, "total_throughput": 1506.78366118452, "mean_e2e_latency_ms": 9317.695084243314, "median_e2e_latency_ms": 10052.179563441314, 
"std_e2e_latency_ms": 5047.758937948556, "p90_e2e_latency_ms": 15426.76037205384, "p99_e2e_latency_ms": 18604.811251361385, "mean_ttft_ms": 153.06907128542662, "median_ttft_ms": 107.1817415067926, "std_ttft_ms": 91.4799906514664, "p99_ttft_ms": 337.4836889700964, "mean_tpot_ms": 18.252970567128944, "median_tpot_ms": 18.4298531433687, "std_tpot_ms": 1.4893520951460373, "p99_tpot_ms": 21.899579410787737, "mean_itl_ms": 18.004708145534345, "median_itl_ms": 16.874348046258092, "std_itl_ms": 10.496979411646434, "p95_itl_ms": 17.423551063984632, "p99_itl_ms": 88.60507588833578, "concurrency": 13.957228598747054, "accept_length": null, "max_output_tokens_per_s": 976.0, "max_concurrent_requests": 21} +{"tag": "flashmla", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 16, "sharegpt_output_len": null, "random_input_len": 1000, "random_output_len": 1000, "random_range_ratio": 0.0, "server_info": null, "duration": 55.16790589294396, "completed": 80, "total_input_tokens": 39668, "total_input_text_tokens": 39668, "total_input_vision_tokens": 0, "total_output_tokens": 40805, "total_output_tokens_retokenized": 40786, "request_throughput": 1.4501184829317961, "input_throughput": 719.0412497617311, "output_throughput": 739.6510587003993, "total_throughput": 1458.6923084621305, "mean_e2e_latency_ms": 9602.716599009, "median_e2e_latency_ms": 10392.598490929231, "std_e2e_latency_ms": 5178.017009753717, "p90_e2e_latency_ms": 15825.043794442907, "p99_e2e_latency_ms": 19127.044549903363, "mean_ttft_ms": 184.86261386133265, "median_ttft_ms": 114.34125760570168, "std_ttft_ms": 122.27272272576788, "p99_ttft_ms": 429.63656508596614, "mean_tpot_ms": 18.84696523455597, "median_tpot_ms": 19.015752477440632, "std_tpot_ms": 1.875303441368142, "p99_tpot_ms": 25.919375823585423, "mean_itl_ms": 18.50310546566746, "median_itl_ms": 16.952354926615953, "std_itl_ms": 12.890325841392592, "p95_itl_ms": 17.591408849693835, "p99_itl_ms": 94.11955502349883, "concurrency": 13.925076826578909, "accept_length": null, "max_output_tokens_per_s": 976.0, "max_concurrent_requests": 20} +{"tag": "flashinfer", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 16, "sharegpt_output_len": null, "random_input_len": 1000, "random_output_len": 1000, "random_range_ratio": 0.0, "server_info": null, "duration": 47.810525686945766, "completed": 80, "total_input_tokens": 39668, "total_input_text_tokens": 39668, "total_input_vision_tokens": 0, "total_output_tokens": 40805, "total_output_tokens_retokenized": 37815, "request_throughput": 1.6732717084899837, "input_throughput": 829.6917766547584, "output_throughput": 853.4731508116723, "total_throughput": 1683.1649274664308, "mean_e2e_latency_ms": 8313.95380628528, "median_e2e_latency_ms": 8911.577637074515, "std_e2e_latency_ms": 4493.426788691228, "p90_e2e_latency_ms": 13624.376828689134, "p99_e2e_latency_ms": 16482.360729521602, "mean_ttft_ms": 166.6187765513314, "median_ttft_ms": 107.28733043652028, "std_ttft_ms": 107.45469984138889, "p99_ttft_ms": 383.4053916204721, "mean_tpot_ms": 16.219748328613264, "median_tpot_ms": 16.46422699847982, "std_tpot_ms": 1.3754006608474518, "p99_tpot_ms": 20.223300155676718, "mean_itl_ms": 16.117470535603395, "median_itl_ms": 14.493613503873348, "std_itl_ms": 23.759563258631825, "p95_itl_ms": 15.231409226544201, "p99_itl_ms": 91.41340630594641, "concurrency": 13.911503689749775, "accept_length": null, "max_output_tokens_per_s": 1120.0, "max_concurrent_requests": 21} +{"tag": 
"lightllm-fa3", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 16, "sharegpt_output_len": null, "random_input_len": 1000, "random_output_len": 1000, "random_range_ratio": 0.0, "server_info": null, "duration": 39.198021210031584, "completed": 80, "total_input_tokens": 39668, "total_input_text_tokens": 39668, "total_input_vision_tokens": 0, "total_output_tokens": 40805, "total_output_tokens_retokenized": 40779, "request_throughput": 2.0409193507841246, "input_throughput": 1011.9898600863081, "output_throughput": 1040.9964263593274, "total_throughput": 2052.9862864456354, "mean_e2e_latency_ms": 6749.5986932568485, "median_e2e_latency_ms": 7180.660139070824, "std_e2e_latency_ms": 3674.73758483094, "p90_e2e_latency_ms": 11592.498308001086, "p99_e2e_latency_ms": 13352.02830265276, "mean_ttft_ms": 65.81717804365326, "median_ttft_ms": 65.59701240621507, "std_ttft_ms": 16.49426206282749, "p99_ttft_ms": 103.18048950051889, "mean_tpot_ms": 13.250411406873202, "median_tpot_ms": 13.433356619192136, "std_tpot_ms": 0.6795819356745668, "p99_tpot_ms": 13.671014163799812, "mean_itl_ms": 13.132804988486683, "median_itl_ms": 12.713762000203133, "std_itl_ms": 1.5002947899704477, "p95_itl_ms": 14.877910260111094, "p99_itl_ms": 15.338663104921581, "concurrency": 13.775386583095141, "accept_length": null, "max_output_tokens_per_s": 1232.0, "max_concurrent_requests": 22} +{"tag": "lightllm-flashinfer", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 16, "sharegpt_output_len": null, "random_input_len": 1000, "random_output_len": 1000, "random_range_ratio": 0.0, "server_info": null, "duration": 38.999191141920164, "completed": 80, "total_input_tokens": 39668, "total_input_text_tokens": 39668, "total_input_vision_tokens": 0, "total_output_tokens": 40805, "total_output_tokens_retokenized": 40769, "request_throughput": 2.0513245956531683, "input_throughput": 1017.1493007546235, "output_throughput": 1046.303751570344, "total_throughput": 2063.4530523249678, "mean_e2e_latency_ms": 6713.866910606157, "median_e2e_latency_ms": 7081.338235875592, "std_e2e_latency_ms": 3648.244918498425, "p90_e2e_latency_ms": 11412.26524873637, "p99_e2e_latency_ms": 13317.006446453743, "mean_ttft_ms": 63.56341727369, "median_ttft_ms": 54.98786806128919, "std_ttft_ms": 18.798902727973804, "p99_ttft_ms": 125.0289089186117, "mean_tpot_ms": 13.195385200587856, "median_tpot_ms": 13.332349463977804, "std_tpot_ms": 0.6546048286199364, "p99_tpot_ms": 13.744833133619448, "mean_itl_ms": 13.070555955399977, "median_itl_ms": 12.730133486911654, "std_itl_ms": 1.5181346036760426, "p95_itl_ms": 14.938965067267418, "p99_itl_ms": 15.352271823212506, "concurrency": 13.77232032566836, "accept_length": null, "max_output_tokens_per_s": 1248.0, "max_concurrent_requests": 21} +{"tag": "lightllm-triton", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 16, "sharegpt_output_len": null, "random_input_len": 1000, "random_output_len": 1000, "random_range_ratio": 0.0, "server_info": null, "duration": 46.33363204495981, "completed": 80, "total_input_tokens": 39668, "total_input_text_tokens": 39668, "total_input_vision_tokens": 0, "total_output_tokens": 40805, "total_output_tokens_retokenized": 40771, "request_throughput": 1.7266075735735986, "input_throughput": 856.1383653564689, "output_throughput": 880.6777754958837, "total_throughput": 1736.8161408523526, "mean_e2e_latency_ms": 8104.25450251787, "median_e2e_latency_ms": 
8717.964101932012, "std_e2e_latency_ms": 4437.408088077443, "p90_e2e_latency_ms": 13669.85152466223, "p99_e2e_latency_ms": 16101.117293201383, "mean_ttft_ms": 67.62399404251482, "median_ttft_ms": 64.90619748365134, "std_ttft_ms": 15.426835098115847, "p99_ttft_ms": 97.62805712874979, "mean_tpot_ms": 15.921614299470969, "median_tpot_ms": 16.282790201281642, "std_tpot_ms": 1.0562744090505234, "p99_tpot_ms": 16.695227297923605, "mean_itl_ms": 15.791763753396411, "median_itl_ms": 16.721565974876285, "std_itl_ms": 1.887789638138326, "p95_itl_ms": 17.124590510502458, "p99_itl_ms": 18.57489356771111, "concurrency": 13.99286720221529, "accept_length": null, "max_output_tokens_per_s": 1040.0, "max_concurrent_requests": 21} diff --git a/last_bench/sglang-oai_0123_80_1000_8000.jsonl b/last_bench/sglang-oai_0123_80_1000_8000.jsonl new file mode 100644 index 000000000..28b40d7f4 --- /dev/null +++ b/last_bench/sglang-oai_0123_80_1000_8000.jsonl @@ -0,0 +1,7 @@ +{"tag": "triton", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 16, "sharegpt_output_len": null, "random_input_len": 1000, "random_output_len": 8000, "random_range_ratio": 0.0, "server_info": {"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30000, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 1, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": null, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 604505033, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, 
"uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "triton", "decode_attention_backend": null, "prefill_attention_backend": null, "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, 
"max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": false, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", 
"enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "status": "ready", "max_total_num_tokens": 1386224, "max_req_input_len": 202746, "internal_states": [{"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30000, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 1, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": 3500, 
"pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 604505033, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "triton", "decode_attention_backend": "triton", "prefill_attention_backend": "triton", "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", 
"enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, 
"allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": true, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "use_mla_backend": true, "last_gen_throughput": 65.3115398409416, "memory_usage": {"weight": 56.33, "kvcache": 69.9, "token_capacity": 1386224, "graph": 1.69}, "effective_max_running_requests_per_dp": 3500}], "version": "0.0.0.dev1+g1e8e0cca2"}, "duration": 510.3684002109803, "completed": 80, "total_input_tokens": 39668, "total_input_text_tokens": 39668, "total_input_vision_tokens": 0, "total_output_tokens": 318306, "total_output_tokens_retokenized": 318138, "request_throughput": 0.1567495165588797, "input_throughput": 77.7242477857205, "output_throughput": 623.6788952223845, "total_throughput": 701.403143008105, "mean_e2e_latency_ms": 88432.47628718382, "median_e2e_latency_ms": 85325.70422510616, "std_e2e_latency_ms": 53979.79939917359, "p90_e2e_latency_ms": 167731.33851240855, "p99_e2e_latency_ms": 178339.03011296637, "mean_ttft_ms": 250.5359776376281, "median_ttft_ms": 117.73683200590312, "std_ttft_ms": 279.6251500610817, "p99_ttft_ms": 880.0148263876326, "mean_tpot_ms": 22.046003086600507, "median_tpot_ms": 22.13179399591731, "std_tpot_ms": 1.311077596299769, "p99_tpot_ms": 23.973088838541113, "mean_itl_ms": 22.17242102734307, "median_itl_ms": 22.142937988974154, "std_itl_ms": 4.26715641637369, "p95_itl_ms": 24.86142996931448, "p99_itl_ms": 27.21198833314702, 
"concurrency": 13.861747906120655, "accept_length": null, "max_output_tokens_per_s": 929.0, "max_concurrent_requests": 18} +{"tag": "fa3", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 16, "sharegpt_output_len": null, "random_input_len": 1000, "random_output_len": 8000, "random_range_ratio": 0.0, "server_info": {"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30001, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 1, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": null, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 398782077, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": 
"default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "fa3", "decode_attention_backend": null, "prefill_attention_backend": null, "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, 
"ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": false, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, 
"encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "status": "ready", "max_total_num_tokens": 1386225, "max_req_input_len": 202746, "internal_states": [{"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30001, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 1, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": 3500, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 398782077, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, 
"bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "fa3", "decode_attention_backend": "fa3", "prefill_attention_backend": "fa3", "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", 
"hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": false, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, 
"debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "use_mla_backend": true, "last_gen_throughput": 111.70549511187465, "memory_usage": {"weight": 56.33, "kvcache": 69.9, "token_capacity": 1386225, "graph": 2.36}, "effective_max_running_requests_per_dp": 3500}], "version": "0.0.0.dev1+g1e8e0cca2"}, "duration": 378.9256413059775, "completed": 80, "total_input_tokens": 39668, "total_input_text_tokens": 39668, "total_input_vision_tokens": 0, "total_output_tokens": 318306, "total_output_tokens_retokenized": 318087, "request_throughput": 0.21112321595413241, "input_throughput": 104.68544663085656, "output_throughput": 840.0223297187009, "total_throughput": 944.7077763495574, "mean_e2e_latency_ms": 66920.71866179758, "median_e2e_latency_ms": 67326.38371211942, "std_e2e_latency_ms": 40074.098965241894, "p90_e2e_latency_ms": 125597.93171191122, "p99_e2e_latency_ms": 134578.84816802104, "mean_ttft_ms": 214.4721156655578, "median_ttft_ms": 109.45712006650865, "std_ttft_ms": 219.49495984278872, "p99_ttft_ms": 720.4752121167257, "mean_tpot_ms": 16.863758946605724, "median_tpot_ms": 17.003689203956057, "std_tpot_ms": 0.6634634044921125, "p99_tpot_ms": 17.452971537097646, "mean_itl_ms": 16.771740067354198, "median_itl_ms": 16.822382458485663, "std_itl_ms": 4.257912709980094, "p95_itl_ms": 17.406025959644467, "p99_itl_ms": 17.852395898662504, "concurrency": 14.12851733784043, "accept_length": null, "max_output_tokens_per_s": 1008.0, "max_concurrent_requests": 19} +{"tag": "flashmla", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 16, "sharegpt_output_len": null, "random_input_len": 1000, "random_output_len": 8000, "random_range_ratio": 0.0, "server_info": {"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30002, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, 
"nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 64, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": null, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 379113072, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "flashmla", "decode_attention_backend": null, "prefill_attention_backend": null, "sampling_backend": "flashinfer", "grammar_backend": 
"xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, 
"disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": false, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "status": "ready", "max_total_num_tokens": 1386176, 
"max_req_input_len": 202746, "internal_states": [{"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30002, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 64, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": 3500, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 379113072, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", 
"dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "flashmla", "decode_attention_backend": "flashmla", "prefill_attention_backend": "flashmla", "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, 
"disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": false, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, 
"remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "use_mla_backend": true, "last_gen_throughput": 88.10241504348274, "memory_usage": {"weight": 56.33, "kvcache": 69.9, "token_capacity": 1386176, "graph": 1.21}, "effective_max_running_requests_per_dp": 3500}], "version": "0.0.0.dev1+g1e8e0cca2"}, "duration": 388.088292311877, "completed": 80, "total_input_tokens": 39668, "total_input_text_tokens": 39668, "total_input_vision_tokens": 0, "total_output_tokens": 318306, "total_output_tokens_retokenized": 318054, "request_throughput": 0.20613865861150507, "input_throughput": 102.21385387251479, "output_throughput": 820.1896483499216, "total_throughput": 922.4035022224364, "mean_e2e_latency_ms": 67931.14652846998, "median_e2e_latency_ms": 68187.61040759273, "std_e2e_latency_ms": 40723.917366219466, "p90_e2e_latency_ms": 127324.08863757737, "p99_e2e_latency_ms": 136953.1988685066, "mean_ttft_ms": 294.5981682394631, "median_ttft_ms": 111.34997848421335, "std_ttft_ms": 363.3717001890983, "p99_ttft_ms": 1089.565416739788, "mean_tpot_ms": 17.089183333079028, "median_tpot_ms": 17.174373370728233, "std_tpot_ms": 0.5925423207604643, "p99_tpot_ms": 17.82896875321477, "mean_itl_ms": 17.006230745068496, "median_itl_ms": 16.955296974629164, "std_itl_ms": 4.835724451681868, "p95_itl_ms": 17.620283225551248, "p99_itl_ms": 18.00625145435333, "concurrency": 14.003235423320401, "accept_length": null, "max_output_tokens_per_s": 992.0, "max_concurrent_requests": 19} +{"tag": "flashinfer", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 16, "sharegpt_output_len": null, "random_input_len": 1000, "random_output_len": 8000, "random_range_ratio": 0.0, "server_info": {"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30003, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 1, 
"swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": null, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 253908945, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "flashinfer", "decode_attention_backend": null, "prefill_attention_backend": null, "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, 
"speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": 
"eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": false, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "status": "ready", "max_total_num_tokens": 1386222, "max_req_input_len": 202746, "internal_states": [{"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30003, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, 
"modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 1, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": 3500, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 253908945, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "flashinfer", "decode_attention_backend": "flashinfer", "prefill_attention_backend": "flashinfer", "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, 
"speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, 
"enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": false, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "use_mla_backend": true, "last_gen_throughput": 115.96867184133801, "memory_usage": {"weight": 56.33, "kvcache": 69.9, "token_capacity": 1386222, "graph": 1.71}, "effective_max_running_requests_per_dp": 3500}], "version": "0.0.0.dev1+g1e8e0cca2"}, "duration": 336.1375814990606, "completed": 80, "total_input_tokens": 39668, 
"total_input_text_tokens": 39668, "total_input_vision_tokens": 0, "total_output_tokens": 318306, "total_output_tokens_retokenized": 289408, "request_throughput": 0.2379977854401965, "input_throughput": 118.01120191052145, "output_throughput": 946.95153865409, "total_throughput": 1064.9627405646113, "mean_e2e_latency_ms": 59016.91139079048, "median_e2e_latency_ms": 59690.270767663606, "std_e2e_latency_ms": 35464.544102828455, "p90_e2e_latency_ms": 110727.35556068365, "p99_e2e_latency_ms": 119681.62642030511, "mean_ttft_ms": 214.80026175267994, "median_ttft_ms": 109.64243847411126, "std_ttft_ms": 214.38356439875133, "p99_ttft_ms": 738.7441637879238, "mean_tpot_ms": 14.840095201035492, "median_tpot_ms": 14.978818821285847, "std_tpot_ms": 0.49894420942441764, "p99_tpot_ms": 15.18216584803879, "mean_itl_ms": 14.98574704604075, "median_itl_ms": 14.707728987559676, "std_itl_ms": 41.27780934363587, "p95_itl_ms": 15.214697923511267, "p99_itl_ms": 15.75610991567373, "concurrency": 14.045894214528444, "accept_length": null, "max_output_tokens_per_s": 1120.0, "max_concurrent_requests": 18} +{"tag": "lightllm-fa3", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 16, "sharegpt_output_len": null, "random_input_len": 1000, "random_output_len": 8000, "random_range_ratio": 0.0, "server_info": null, "duration": 301.9964598421939, "completed": 80, "total_input_tokens": 39668, "total_input_text_tokens": 39668, "total_input_vision_tokens": 0, "total_output_tokens": 318306, "total_output_tokens_retokenized": 315950, "request_throughput": 0.26490376755344563, "input_throughput": 131.352533141376, "output_throughput": 1054.0057329358383, "total_throughput": 1185.3582660772142, "mean_e2e_latency_ms": 53554.92747395474, "median_e2e_latency_ms": 54525.81842453219, "std_e2e_latency_ms": 31906.4896132971, "p90_e2e_latency_ms": 98788.09053299484, "p99_e2e_latency_ms": 110253.29957607435, "mean_ttft_ms": 468.15795370785054, "median_ttft_ms": 129.87286353018135, "std_ttft_ms": 722.0932614354268, "p99_ttft_ms": 2356.574236305896, "mean_tpot_ms": 13.43742028334658, "median_tpot_ms": 13.60711470533312, "std_tpot_ms": 0.6681434813613275, "p99_tpot_ms": 14.401517221818835, "mean_itl_ms": 13.34789935342274, "median_itl_ms": 12.691037962213159, "std_itl_ms": 7.594706030906152, "p95_itl_ms": 14.856397034600377, "p99_itl_ms": 15.24846603162586, "concurrency": 14.186902058902145, "accept_length": null, "max_output_tokens_per_s": 1248.0, "max_concurrent_requests": 18} +{"tag": "lightllm-flashinfer", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 16, "sharegpt_output_len": null, "random_input_len": 1000, "random_output_len": 8000, "random_range_ratio": 0.0, "server_info": null, "duration": 302.9897459298372, "completed": 80, "total_input_tokens": 39668, "total_input_text_tokens": 39668, "total_input_vision_tokens": 0, "total_output_tokens": 318306, "total_output_tokens_retokenized": 318107, "request_throughput": 0.26403533807551843, "input_throughput": 130.9219223847458, "output_throughput": 1050.5504040183246, "total_throughput": 1181.4723264030704, "mean_e2e_latency_ms": 53567.94852038438, "median_e2e_latency_ms": 54704.7826530179, "std_e2e_latency_ms": 32153.43528106655, "p90_e2e_latency_ms": 99366.40834102874, "p99_e2e_latency_ms": 109649.55897029256, "mean_ttft_ms": 206.28674371691886, "median_ttft_ms": 133.14349250867963, "std_ttft_ms": 166.94021679176737, "p99_ttft_ms": 701.204480342567, "mean_tpot_ms": 13.495440161882424, 
"median_tpot_ms": 13.699334770739123, "std_tpot_ms": 0.6245435190179904, "p99_tpot_ms": 14.051371312114576, "mean_itl_ms": 13.423134283187423, "median_itl_ms": 12.703799526207149, "std_itl_ms": 5.127250146021068, "p95_itl_ms": 14.870735118165612, "p99_itl_ms": 15.211140103638169, "concurrency": 14.143831397591656, "accept_length": null, "max_output_tokens_per_s": 1246.0, "max_concurrent_requests": 18} +{"tag": "lightllm-triton", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 16, "sharegpt_output_len": null, "random_input_len": 1000, "random_output_len": 8000, "random_range_ratio": 0.0, "server_info": null, "duration": 469.9071873531211, "completed": 80, "total_input_tokens": 39668, "total_input_text_tokens": 39668, "total_input_vision_tokens": 0, "total_output_tokens": 318306, "total_output_tokens_retokenized": 318227, "request_throughput": 0.17024638514388674, "input_throughput": 84.41667007359624, "output_throughput": 677.3805733701251, "total_throughput": 761.7972434437214, "mean_e2e_latency_ms": 80378.25429544318, "median_e2e_latency_ms": 79052.61100351345, "std_e2e_latency_ms": 49463.25117134018, "p90_e2e_latency_ms": 152817.94724198992, "p99_e2e_latency_ms": 160306.36509028258, "mean_ttft_ms": 167.8032555442769, "median_ttft_ms": 133.42473388183862, "std_ttft_ms": 93.26758803871225, "p99_ttft_ms": 393.4040538291447, "mean_tpot_ms": 19.99498991112318, "median_tpot_ms": 20.358366379219007, "std_tpot_ms": 1.171219880118829, "p99_tpot_ms": 21.51192666823354, "mean_itl_ms": 20.179870766167, "median_itl_ms": 20.82169963978231, "std_itl_ms": 5.298449338488774, "p95_itl_ms": 22.915064496919513, "p99_itl_ms": 23.15957001177594, "concurrency": 13.684107237975288, "accept_length": null, "max_output_tokens_per_s": 1040.0, "max_concurrent_requests": 18} diff --git a/last_bench/sglang-oai_0123_80_8000_1000.jsonl b/last_bench/sglang-oai_0123_80_8000_1000.jsonl new file mode 100644 index 000000000..ec5185eb6 --- /dev/null +++ b/last_bench/sglang-oai_0123_80_8000_1000.jsonl @@ -0,0 +1,7 @@ +{"tag": "triton", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 16, "sharegpt_output_len": null, "random_input_len": 8000, "random_output_len": 1000, "random_range_ratio": 0.0, "server_info": {"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30000, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, 
"schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 1, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": null, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 604505033, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "triton", "decode_attention_backend": null, "prefill_attention_backend": null, "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", 
"speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 
320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": false, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "status": "ready", "max_total_num_tokens": 1386224, "max_req_input_len": 202746, "internal_states": [{"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30000, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", 
"enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 1, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": 3500, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 604505033, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "triton", "decode_attention_backend": "triton", "prefill_attention_backend": "triton", "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", 
"nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, 
"enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": true, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "use_mla_backend": true, "last_gen_throughput": 151.9219302705729, "memory_usage": {"weight": 56.33, "kvcache": 69.9, "token_capacity": 1386224, "graph": 1.69}, 
"effective_max_running_requests_per_dp": 3500}], "version": "0.0.0.dev1+g1e8e0cca2"}, "duration": 91.72738593397662, "completed": 80, "total_input_tokens": 300020, "total_input_text_tokens": 300020, "total_input_vision_tokens": 0, "total_output_tokens": 41669, "total_output_tokens_retokenized": 41658, "request_throughput": 0.8721495678246218, "input_throughput": 3270.778916734288, "output_throughput": 454.2700042710521, "total_throughput": 3725.0489210053397, "mean_e2e_latency_ms": 16465.93430028588, "median_e2e_latency_ms": 16535.203597974032, "std_e2e_latency_ms": 8297.40087689733, "p90_e2e_latency_ms": 27848.679464147426, "p99_e2e_latency_ms": 34066.74822968196, "mean_ttft_ms": 1275.6713679729728, "median_ttft_ms": 316.7236299486831, "std_ttft_ms": 2237.0240657685317, "p99_ttft_ms": 8538.692513904534, "mean_tpot_ms": 31.521857190271305, "median_tpot_ms": 29.72083288174972, "std_tpot_ms": 17.82936716609273, "p99_tpot_ms": 77.68743860196824, "mean_itl_ms": 29.22327457401753, "median_itl_ms": 23.241017595864832, "std_itl_ms": 87.96533202479867, "p95_itl_ms": 24.558546382468194, "p99_itl_ms": 199.04860019916651, "concurrency": 14.360757483822947, "accept_length": null, "max_output_tokens_per_s": 720.0, "max_concurrent_requests": 19} +{"tag": "fa3", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 16, "sharegpt_output_len": null, "random_input_len": 8000, "random_output_len": 1000, "random_range_ratio": 0.0, "server_info": {"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30001, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 1, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": null, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 398782077, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": 
null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "fa3", "decode_attention_backend": null, "prefill_attention_backend": null, "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, 
"eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": false, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, 
"enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "status": "ready", "max_total_num_tokens": 1386225, "max_req_input_len": 202746, "internal_states": [{"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30001, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 1, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", 
"enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": 3500, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 398782077, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "fa3", "decode_attention_backend": "fa3", "prefill_attention_backend": "fa3", "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, 
"speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": 
false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": false, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "use_mla_backend": true, "last_gen_throughput": 219.95093668460453, "memory_usage": {"weight": 56.33, "kvcache": 69.9, "token_capacity": 1386225, "graph": 2.36}, "effective_max_running_requests_per_dp": 3500}], "version": "0.0.0.dev1+g1e8e0cca2"}, "duration": 59.73137983889319, "completed": 80, "total_input_tokens": 300020, "total_input_text_tokens": 300020, "total_input_vision_tokens": 0, "total_output_tokens": 41669, "total_output_tokens_retokenized": 41662, "request_throughput": 1.3393295151689968, "input_throughput": 5022.82051426253, "output_throughput": 697.6065195947116, "total_throughput": 5720.427033857241, "mean_e2e_latency_ms": 10622.85218659672, "median_e2e_latency_ms": 11066.243920009583, "std_e2e_latency_ms": 5402.915248171756, "p90_e2e_latency_ms": 17654.418413643725, "p99_e2e_latency_ms": 20623.906321420785, "mean_ttft_ms": 451.66798550635576, "median_ttft_ms": 203.69619643315673, "std_ttft_ms": 607.1807883246723, "p99_ttft_ms": 2349.38725421438, "mean_tpot_ms": 20.256926417193917, 
"median_tpot_ms": 20.214214266303014, "std_tpot_ms": 4.371429622338443, "p99_tpot_ms": 35.06027535497477, "mean_itl_ms": 19.5660749396803, "median_itl_ms": 16.61551301367581, "std_itl_ms": 26.438658107068314, "p95_itl_ms": 17.541494616307315, "p99_itl_ms": 110.56301650125533, "concurrency": 14.227499468786501, "accept_length": null, "max_output_tokens_per_s": 992.0, "max_concurrent_requests": 20} +{"tag": "flashmla", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 16, "sharegpt_output_len": null, "random_input_len": 8000, "random_output_len": 1000, "random_range_ratio": 0.0, "server_info": {"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30002, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 64, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": null, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 379113072, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, 
"enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "flashmla", "decode_attention_backend": null, "prefill_attention_backend": null, "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, 
"hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": false, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, 
"disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "status": "ready", "max_total_num_tokens": 1386176, "max_req_input_len": 202746, "internal_states": [{"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30002, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 64, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": 3500, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 379113072, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, 
"log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "flashmla", "decode_attention_backend": "flashmla", "prefill_attention_backend": "flashmla", "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": 
null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": false, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, 
"enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "use_mla_backend": true, "last_gen_throughput": 206.06980700252126, "memory_usage": {"weight": 56.33, "kvcache": 69.9, "token_capacity": 1386176, "graph": 1.21}, "effective_max_running_requests_per_dp": 3500}], "version": "0.0.0.dev1+g1e8e0cca2"}, "duration": 61.09140785806812, "completed": 80, "total_input_tokens": 300020, "total_input_text_tokens": 300020, "total_input_vision_tokens": 0, "total_output_tokens": 41669, "total_output_tokens_retokenized": 41654, "request_throughput": 1.3095131182090558, "input_throughput": 4911.001571563511, "output_throughput": 682.0762765331643, "total_throughput": 5593.077848096676, "mean_e2e_latency_ms": 10810.300631891005, "median_e2e_latency_ms": 11076.29925746005, "std_e2e_latency_ms": 5473.510942669237, "p90_e2e_latency_ms": 17724.158798204742, "p99_e2e_latency_ms": 21265.637267699916, "mean_ttft_ms": 537.9770307714352, "median_ttft_ms": 201.4428024413064, "std_ttft_ms": 779.0185306377831, "p99_ttft_ms": 2850.2713821735233, "mean_tpot_ms": 20.48185703899536, "median_tpot_ms": 20.238185403098377, "std_tpot_ms": 4.742075680759459, "p99_tpot_ms": 35.28982754592937, "mean_itl_ms": 19.764911403941884, "median_itl_ms": 16.86290360521525, "std_itl_ms": 27.32488423973038, "p95_itl_ms": 17.988642130512744, "p99_itl_ms": 103.51196154253557, "concurrency": 14.156230489244916, "accept_length": null, "max_output_tokens_per_s": 960.0, "max_concurrent_requests": 20} +{"tag": "flashinfer", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 16, "sharegpt_output_len": null, "random_input_len": 8000, "random_output_len": 1000, "random_range_ratio": 0.0, "server_info": {"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", 
"tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30003, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 1, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": null, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 253908945, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, 
"enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "flashinfer", "decode_attention_backend": null, "prefill_attention_backend": null, "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 
144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": false, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, 
"sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "status": "ready", "max_total_num_tokens": 1386222, "max_req_input_len": 202746, "internal_states": [{"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30003, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 1, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": 3500, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 253908945, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, 
"admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "flashinfer", "decode_attention_backend": "flashinfer", "prefill_attention_backend": "flashinfer", "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, 
"dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": false, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, 
"disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "use_mla_backend": true, "last_gen_throughput": 230.46929758756366, "memory_usage": {"weight": 56.33, "kvcache": 69.9, "token_capacity": 1386222, "graph": 1.71}, "effective_max_running_requests_per_dp": 3500}], "version": "0.0.0.dev1+g1e8e0cca2"}, "duration": 55.89285264792852, "completed": 80, "total_input_tokens": 300020, "total_input_text_tokens": 300020, "total_input_vision_tokens": 0, "total_output_tokens": 41669, "total_output_tokens_retokenized": 38512, "request_throughput": 1.431310019259948, "input_throughput": 5367.770399729619, "output_throughput": 745.5157149067846, "total_throughput": 6113.286114636404, "mean_e2e_latency_ms": 9957.88358815189, "median_e2e_latency_ms": 10363.739449880086, "std_e2e_latency_ms": 5054.131718005112, "p90_e2e_latency_ms": 16646.5964271687, "p99_e2e_latency_ms": 19347.230267827385, "mean_ttft_ms": 453.599761048099, "median_ttft_ms": 213.0170369055122, "std_ttft_ms": 603.1003102286579, "p99_ttft_ms": 2351.627971509006, "mean_tpot_ms": 18.996200609382797, "median_tpot_ms": 18.850297664384062, "std_tpot_ms": 4.495230873739031, "p99_tpot_ms": 34.1858196159775, "mean_itl_ms": 18.34537334671199, "median_itl_ms": 15.283621032722294, "std_itl_ms": 27.258801329256364, "p95_itl_ms": 16.486559237819165, "p99_itl_ms": 109.11040107021108, "concurrency": 14.252818550346, "accept_length": null, "max_output_tokens_per_s": 1056.0, "max_concurrent_requests": 20} +{"tag": "lightllm-fa3", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 16, "sharegpt_output_len": null, "random_input_len": 8000, "random_output_len": 1000, "random_range_ratio": 0.0, "server_info": null, "duration": 45.33212866913527, "completed": 80, "total_input_tokens": 300020, "total_input_text_tokens": 300020, "total_input_vision_tokens": 0, "total_output_tokens": 41669, "total_output_tokens_retokenized": 41658, "request_throughput": 1.7647527779666923, "input_throughput": 6618.264105569588, "output_throughput": 919.1935438136762, "total_throughput": 7537.457649383264, "mean_e2e_latency_ms": 8019.016220018965, "median_e2e_latency_ms": 8524.129228899255, "std_e2e_latency_ms": 4157.89820587007, "p90_e2e_latency_ms": 13101.259714760823, "p99_e2e_latency_ms": 15792.665833330246, "mean_ttft_ms": 181.4006418280769, "median_ttft_ms": 156.01945854723454, "std_ttft_ms": 135.5525971806731, "p99_ttft_ms": 734.0356547082774, "mean_tpot_ms": 15.553320464290449, "median_tpot_ms": 15.566042172658063, "std_tpot_ms": 2.911541114674202, "p99_tpot_ms": 26.047497648062944, "mean_itl_ms": 15.084295362151842, "median_itl_ms": 12.69083796069026, "std_itl_ms": 
13.922606183367431, "p95_itl_ms": 15.18232161179185, "p99_itl_ms": 88.5000554844737, "concurrency": 14.151581150838433, "accept_length": null, "max_output_tokens_per_s": 1232.0, "max_concurrent_requests": 21} +{"tag": "lightllm-flashinfer", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 16, "sharegpt_output_len": null, "random_input_len": 8000, "random_output_len": 1000, "random_range_ratio": 0.0, "server_info": null, "duration": 45.5442786780186, "completed": 80, "total_input_tokens": 300020, "total_input_text_tokens": 300020, "total_input_vision_tokens": 0, "total_output_tokens": 41669, "total_output_tokens_retokenized": 41662, "request_throughput": 1.7565323751325772, "input_throughput": 6587.435539840947, "output_throughput": 914.911844242492, "total_throughput": 7502.3473840834395, "mean_e2e_latency_ms": 8048.459070990793, "median_e2e_latency_ms": 8570.676261559129, "std_e2e_latency_ms": 4178.828855557125, "p90_e2e_latency_ms": 13119.105676165787, "p99_e2e_latency_ms": 15836.62676514126, "mean_ttft_ms": 184.9294237967115, "median_ttft_ms": 149.6679214760661, "std_ttft_ms": 134.0790601278319, "p99_ttft_ms": 661.3055070326664, "mean_tpot_ms": 15.603147434568124, "median_tpot_ms": 15.65682640210949, "std_tpot_ms": 2.6834146850235157, "p99_tpot_ms": 25.564559472766067, "mean_itl_ms": 15.133441347685872, "median_itl_ms": 12.702923035249114, "std_itl_ms": 13.846450207312868, "p95_itl_ms": 15.275433706119653, "p99_itl_ms": 93.04464281536639, "concurrency": 14.137378928124793, "accept_length": null, "max_output_tokens_per_s": 1248.0, "max_concurrent_requests": 21} +{"tag": "lightllm-triton", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 16, "sharegpt_output_len": null, "random_input_len": 8000, "random_output_len": 1000, "random_range_ratio": 0.0, "server_info": null, "duration": 71.63242872082628, "completed": 80, "total_input_tokens": 300020, "total_input_text_tokens": 300020, "total_input_vision_tokens": 0, "total_output_tokens": 41669, "total_output_tokens_retokenized": 41654, "request_throughput": 1.1168126144624346, "input_throughput": 4188.326507387746, "output_throughput": 581.7058104004399, "total_throughput": 4770.0323177881855, "mean_e2e_latency_ms": 12452.863597802934, "median_e2e_latency_ms": 13016.981037915684, "std_e2e_latency_ms": 6482.27810358746, "p90_e2e_latency_ms": 20901.55612407253, "p99_e2e_latency_ms": 24121.33116378681, "mean_ttft_ms": 225.4112859343877, "median_ttft_ms": 181.95387651212513, "std_ttft_ms": 187.30133292464953, "p99_ttft_ms": 973.6838864628226, "mean_tpot_ms": 24.17535566961863, "median_tpot_ms": 24.092253083753874, "std_tpot_ms": 4.128153213237987, "p99_tpot_ms": 38.4425003147593, "mean_itl_ms": 23.532990351701816, "median_itl_ms": 20.95877891406417, "std_itl_ms": 17.612692753270967, "p95_itl_ms": 23.275281325913966, "p99_itl_ms": 105.4545730212703, "concurrency": 13.907515152206376, "accept_length": null, "max_output_tokens_per_s": 782.0, "max_concurrent_requests": 20} diff --git a/lightllm/common/basemodel/attention/flashinfer/mla.py b/lightllm/common/basemodel/attention/flashinfer/mla.py index 6c74b22e1..537dbee22 100644 --- a/lightllm/common/basemodel/attention/flashinfer/mla.py +++ b/lightllm/common/basemodel/attention/flashinfer/mla.py @@ -16,6 +16,8 @@ def __init__(self, model): self.qk_nope_head_dim = model.qk_nope_head_dim self.qk_rope_head_dim = model.qk_rope_head_dim self.kv_lora_rank = model.kv_lora_rank + # v_head_dim may differ from 
qk_nope_head_dim (e.g., GLM-4.7-Flash: v_head_dim=256, qk_nope_head_dim=192) + self.v_head_dim = getattr(model, "v_head_dim", self.qk_nope_head_dim) self.q_data_type = model.data_type self.kv_data_type = model.data_type self.workspace_buffer = torch.empty(256 * 1024 * 1024, dtype=torch.int8, device=get_current_device_id()) @@ -69,7 +71,7 @@ def init_state(self): num_qo_heads=self.backend.tp_q_head_num, num_kv_heads=self.backend.tp_q_head_num, head_dim_qk=self.backend.qk_nope_head_dim + self.backend.qk_rope_head_dim, - head_dim_vo=self.backend.qk_nope_head_dim, + head_dim_vo=self.backend.v_head_dim, # Use v_head_dim, not qk_nope_head_dim q_data_type=self.backend.q_data_type, causal=True, sm_scale=self.backend.softmax_scale, @@ -101,7 +103,8 @@ def _mla_prefill_att( ) -> torch.Tensor: self.backend: MlaFlashInferAttBackend = self.backend # for typing k_nope, k_rope = k - o_tensor = alloc_func((q.shape[0], q.shape[1], k_nope.shape[2]), q.dtype, device="cuda") + # Output dimension is v_head_dim (from v.shape[-1]), not qk_nope_head_dim + o_tensor = alloc_func((q.shape[0], q.shape[1], v.shape[-1]), q.dtype, device="cuda") q_head_num = q.shape[1] k = torch.cat([k_nope, torch.repeat_interleave(k_rope, q_head_num, dim=-2)], dim=-1) self.prefill_wrapper.run(q, k, v, out=o_tensor) diff --git a/lightllm/common/basemodel/attention/triton/mla.py b/lightllm/common/basemodel/attention/triton/mla.py index 8288193ad..fbdae4012 100644 --- a/lightllm/common/basemodel/attention/triton/mla.py +++ b/lightllm/common/basemodel/attention/triton/mla.py @@ -44,7 +44,8 @@ def _mla_prefill_att( qk_rope_head_dim = 64 q_nope, q_rope = q[:, :, :-qk_rope_head_dim], q[:, :, -qk_rope_head_dim:] - o_tensor = alloc_func(q_nope.shape, dtype=q_nope.dtype, device=q.device) + # GLM-4.7-Flash : v_head_dim != qk_nope_head_dim + o_tensor = alloc_func((q_nope.shape[0], q_nope.shape[1], v.shape[-1]), dtype=q_nope.dtype, device=q.device) k_nope, k_rope = k assert att_control.mla_prefill softmax_scale = att_control.mla_prefill_dict["softmax_scale"] diff --git a/lightllm/common/basemodel/basemodel.py b/lightllm/common/basemodel/basemodel.py index 26d51af3d..7967004a3 100755 --- a/lightllm/common/basemodel/basemodel.py +++ b/lightllm/common/basemodel/basemodel.py @@ -1028,6 +1028,7 @@ def _gen_special_model_input(self, token_num: int): "Deepseek3MTPModel" in str(self.__class__) or "Qwen3MOEMTPModel" in str(self.__class__) or "MistralMTPModel" in str(self.__class__) + or "Glm4MoeLiteMTPModel" in str(self.__class__) ) if is_mtp_draft_model: special_model_input["mtp_draft_input_hiddens"] = torch.randn( diff --git a/lightllm/common/basemodel/triton_kernel/att/decode_att/gqa/flash_decoding/gqa_flash_decoding_vsm.py b/lightllm/common/basemodel/triton_kernel/att/decode_att/gqa/flash_decoding/gqa_flash_decoding_vsm.py index 6a9bb79c7..141587ff3 100644 --- a/lightllm/common/basemodel/triton_kernel/att/decode_att/gqa/flash_decoding/gqa_flash_decoding_vsm.py +++ b/lightllm/common/basemodel/triton_kernel/att/decode_att/gqa/flash_decoding/gqa_flash_decoding_vsm.py @@ -81,8 +81,11 @@ def _fwd_kernel_calcu_index_and_block_seq( vsm_count, batch_size, BLOCK_N: tl.constexpr, + MAX_BATCH_SIZE: tl.constexpr, ): - b_seq_len = tl.load(b_seq_len + tl.arange(0, 2048), mask=tl.arange(0, 2048) < batch_size, other=0) + b_seq_len = tl.load( + b_seq_len + tl.arange(0, MAX_BATCH_SIZE), mask=tl.arange(0, MAX_BATCH_SIZE) < batch_size, other=0 + ) total_token_num = tl.sum(b_seq_len) block_seq = tl.cdiv(total_token_num, vsm_count * 4) @@ -93,9 +96,9 @@ def 
_fwd_kernel_calcu_index_and_block_seq( cumsum_seq_len = tl.cumsum(block_seq_len) batch_start_index = cumsum_seq_len - block_seq_len tl.store( - mid_o_batch_start_index + tl.arange(0, 2048), + mid_o_batch_start_index + tl.arange(0, MAX_BATCH_SIZE), batch_start_index, - mask=tl.arange(0, 2048) < batch_size, + mask=tl.arange(0, MAX_BATCH_SIZE) < batch_size, ) tl.store(mid_o_decode_att_block_seq, block_seq) @@ -455,7 +458,6 @@ def gqa_token_decode_attention_flash_decoding_vsm( ) if not hasattr(infer_state, "decode_att_block_seq"): - assert batch_size <= 2048 decode_att_block_seq = torch.empty( [ 1, @@ -477,6 +479,7 @@ def gqa_token_decode_attention_flash_decoding_vsm( num_vsm, batch_size, BLOCK_N=run_config["BLOCK_N"], + MAX_BATCH_SIZE=triton.next_power_of_2(batch_size), num_warps=4, ) diff --git a/lightllm/common/basemodel/triton_kernel/mla_att/decode_att/gqa_flash_decoding.py b/lightllm/common/basemodel/triton_kernel/mla_att/decode_att/gqa_flash_decoding.py index 28839b5f5..063181d99 100644 --- a/lightllm/common/basemodel/triton_kernel/mla_att/decode_att/gqa_flash_decoding.py +++ b/lightllm/common/basemodel/triton_kernel/mla_att/decode_att/gqa_flash_decoding.py @@ -67,7 +67,6 @@ def gqa_token_decode_attention_flash_decoding( ) if not hasattr(infer_state, "decode_att_block_seq"): - assert batch_size <= 2048 decode_att_block_seq = torch.empty( [ 1, @@ -89,6 +88,7 @@ def gqa_token_decode_attention_flash_decoding( vsm_count, batch_size, BLOCK_N=BLOCK_N, + MAX_BATCH_SIZE=triton.next_power_of_2(batch_size), num_warps=4, ) @@ -134,8 +134,11 @@ def _fwd_kernel_calcu_index_and_block_seq( num_sm, batch_size, BLOCK_N: tl.constexpr, + MAX_BATCH_SIZE: tl.constexpr, ): - b_seq_len = tl.load(b_seq_len_ptr + tl.arange(0, 2048), mask=tl.arange(0, 2048) < batch_size, other=0) + b_seq_len = tl.load( + b_seq_len_ptr + tl.arange(0, MAX_BATCH_SIZE), mask=tl.arange(0, MAX_BATCH_SIZE) < batch_size, other=0 + ) total_token_num = tl.sum(b_seq_len) block_seq = tl.cast(total_token_num / (num_sm * 4), dtype=tl.int32) + 1 @@ -144,6 +147,10 @@ def _fwd_kernel_calcu_index_and_block_seq( block_seq_len = tl.cdiv(b_seq_len, block_seq) cumsum_seq_len = tl.cumsum(block_seq_len) batch_start_index = cumsum_seq_len - block_seq_len - tl.store(mid_o_batch_start_index_ptr + tl.arange(0, 2048), batch_start_index, mask=tl.arange(0, 2048) < batch_size) + tl.store( + mid_o_batch_start_index_ptr + tl.arange(0, MAX_BATCH_SIZE), + batch_start_index, + mask=tl.arange(0, MAX_BATCH_SIZE) < batch_size, + ) tl.store(mid_o_decode_att_block_seq_ptr, block_seq) return diff --git a/lightllm/common/basemodel/triton_kernel/mla_att/prefill_att/context_flashattention_nopad_with_v.py b/lightllm/common/basemodel/triton_kernel/mla_att/prefill_att/context_flashattention_nopad_with_v.py index be0635182..d79020844 100644 --- a/lightllm/common/basemodel/triton_kernel/mla_att/prefill_att/context_flashattention_nopad_with_v.py +++ b/lightllm/common/basemodel/triton_kernel/mla_att/prefill_att/context_flashattention_nopad_with_v.py @@ -36,6 +36,9 @@ def _fwd_kernel_with_v( BLOCK_DMODEL: tl.constexpr, BLOCK_ROPE_DMODEL: tl.constexpr, BLOCK_N: tl.constexpr, + BLOCK_V_DMODEL: tl.constexpr, + ACTUAL_DMODEL: tl.constexpr, + ACTUAL_V_DMODEL: tl.constexpr, ): cur_batch = tl.program_id(0) cur_head = tl.program_id(1) @@ -53,8 +56,13 @@ def _fwd_kernel_with_v( # initialize offsets offs_n = tl.arange(0, BLOCK_N) offs_d = tl.arange(0, BLOCK_DMODEL) + offs_v_d = tl.arange(0, BLOCK_V_DMODEL) offs_rope_d = tl.arange(0, BLOCK_ROPE_DMODEL) offs_m = start_m * BLOCK_M + tl.arange(0, 
BLOCK_M) + + d_mask = offs_d < ACTUAL_DMODEL + v_d_mask = offs_v_d < ACTUAL_V_DMODEL + off_q = (cur_batch_in_q_start_index + offs_m[:, None]) * stride_q_bs + cur_head * stride_q_h + offs_d[None, :] off_q_rope = ( (cur_batch_in_q_start_index + offs_m[:, None]) * stride_q_rope_bs @@ -63,9 +71,10 @@ def _fwd_kernel_with_v( ) off_k = offs_n[None, :] * stride_k_bs + cur_k_head * stride_k_h + offs_d[:, None] off_k_rope = offs_n[None, :] * stride_k_rope_bs + offs_rope_d[:, None] - off_v = offs_n[:, None] * stride_vbs + cur_k_head * stride_vh + offs_d[None, :] + off_v = offs_n[:, None] * stride_vbs + cur_k_head * stride_vh + offs_v_d[None, :] - q = tl.load(Q_nope + off_q, mask=offs_m[:, None] < cur_batch_seq_len, other=0.0) + q_mask = (offs_m[:, None] < cur_batch_seq_len) & d_mask[None, :] + q = tl.load(Q_nope + off_q, mask=q_mask, other=0.0) q_rope = tl.load(Q_rope + off_q_rope, mask=offs_m[:, None] < cur_batch_seq_len, other=0.0) k_ptrs = K_nope + off_k @@ -75,7 +84,7 @@ def _fwd_kernel_with_v( # initialize pointer to m and l m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf") l_i = tl.zeros([BLOCK_M], dtype=tl.float32) - acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32) + acc = tl.zeros([BLOCK_M, BLOCK_V_DMODEL], dtype=tl.float32) block_mask = tl.where(block_start_loc < cur_batch_seq_len, 1, 0) block_end_loc = tl.minimum((start_m + 1) * BLOCK_M + prompt_cache_len, cur_batch_seq_len + prompt_cache_len) @@ -83,14 +92,16 @@ def _fwd_kernel_with_v( for start_n in range(0, block_mask * block_end_loc, BLOCK_N): start_n = tl.multiple_of(start_n, BLOCK_N) # -- compute qk ---- + k_seq_mask = (start_n + offs_n[None, :]) < block_end_loc + k_mask = k_seq_mask & d_mask[:, None] k = tl.load( k_ptrs + (cur_batch_in_kv_start_index + start_n) * stride_k_bs, - mask=(start_n + offs_n[None, :]) < block_end_loc, + mask=k_mask, other=0.0, ) k_rope = tl.load( k_rope_ptrs + (cur_batch_in_kv_start_index + start_n) * stride_k_rope_bs, - mask=(start_n + offs_n[None, :]) < block_end_loc, + mask=k_seq_mask, other=0.0, ) @@ -112,9 +123,11 @@ def _fwd_kernel_with_v( # -- update output accumulator -- acc = acc * alpha[:, None] # update acc + v_seq_mask = (start_n + offs_n[:, None]) < block_end_loc + v_mask = v_seq_mask & v_d_mask[None, :] v = tl.load( v_ptrs + (cur_batch_in_kv_start_index + start_n) * stride_vbs, - mask=(start_n + offs_n[:, None]) < block_end_loc, + mask=v_mask, other=0.0, ) p = p.to(v.dtype) @@ -124,9 +137,10 @@ def _fwd_kernel_with_v( acc = acc / l_i[:, None] # initialize pointers to output - off_o = (cur_batch_in_q_start_index + offs_m[:, None]) * stride_obs + cur_head * stride_oh + offs_d[None, :] + off_o = (cur_batch_in_q_start_index + offs_m[:, None]) * stride_obs + cur_head * stride_oh + offs_v_d[None, :] out_ptrs = Out + off_o - tl.store(out_ptrs, acc, mask=offs_m[:, None] < cur_batch_seq_len) + o_mask = (offs_m[:, None] < cur_batch_seq_len) & v_d_mask[None, :] + tl.store(out_ptrs, acc, mask=o_mask) return @@ -149,13 +163,14 @@ def context_attention_fwd_with_v( BLOCK = 128 if not is_tesla() else 64 q_nope_dim = q_nope.shape[-1] q_rope_dim = q_rope.shape[-1] + v_dim = v.shape[-1] assert q_nope_dim == k_nope.shape[-1] assert q_rope_dim == k_rope.shape[-1] - assert q_nope_dim in {16, 32, 64, 128, 256, 512} - assert q_rope_dim in {16, 32, 64, 128, 256} - assert q_nope_dim == v.shape[-1] - if q_nope_dim >= 512: + q_nope_dim_padded = triton.next_power_of_2(q_nope_dim) + v_dim_padded = triton.next_power_of_2(v_dim) + + if q_nope_dim_padded >= 512 or v_dim_padded >= 512: BLOCK = 64 if not 
is_tesla() else 32 else: BLOCK = 128 if not is_tesla() else 64 @@ -167,7 +182,7 @@ def context_attention_fwd_with_v( batch, head = b_seq_len.shape[0], q_nope.shape[1] grid = (batch, head, triton.cdiv(max_input_len, BLOCK)) # batch, head, - num_warps = 4 if q_nope_dim <= 64 else 8 + num_warps = 4 if q_nope_dim_padded <= 64 else 8 _fwd_kernel_with_v[grid]( q_nope, @@ -194,9 +209,12 @@ def context_attention_fwd_with_v( o.stride(1), b_prompt_cache_len=b_prompt_cache_len, BLOCK_M=BLOCK, - BLOCK_DMODEL=q_nope_dim, + BLOCK_DMODEL=q_nope_dim_padded, BLOCK_ROPE_DMODEL=q_rope_dim, BLOCK_N=BLOCK, + BLOCK_V_DMODEL=v_dim_padded, + ACTUAL_DMODEL=q_nope_dim, + ACTUAL_V_DMODEL=v_dim, num_warps=num_warps, num_stages=1, ) diff --git a/lightllm/common/fused_moe/grouped_topk.py b/lightllm/common/fused_moe/grouped_topk.py index fb0323cd4..2687adf14 100644 --- a/lightllm/common/fused_moe/grouped_topk.py +++ b/lightllm/common/fused_moe/grouped_topk.py @@ -227,7 +227,7 @@ def triton_grouped_topk( scores_buffer = torch.empty((token_num, total_expert_num), dtype=dtype, device="cuda") out_topk_weights = torch.empty((token_num, topk), dtype=torch.float32, device="cuda") - out_topk_ids = torch.empty((token_num, topk), dtype=torch.long, device="cuda") + out_topk_ids = torch.empty((token_num, topk), dtype=torch.int32, device="cuda") assert total_expert_num % num_expert_group == 0 diff --git a/lightllm/common/fused_moe/topk_select.py b/lightllm/common/fused_moe/topk_select.py index 5206800ef..a51ab3d03 100644 --- a/lightllm/common/fused_moe/topk_select.py +++ b/lightllm/common/fused_moe/topk_select.py @@ -196,10 +196,12 @@ def select_experts( scoring_func=scoring_func, ) else: - group_score_topk_num = 1 - # for deepseek v3 - if topk_group == 4 and num_expert_group == 8 and top_k == 8: + if correction_bias is not None: group_score_topk_num = 2 + elif topk_group == 4 and num_expert_group == 8 and top_k == 8: + group_score_topk_num = 2 + else: + group_score_topk_num = 1 topk_weights, topk_ids = triton_grouped_topk( hidden_states=hidden_states, diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_H200/grouped_matmul:v1/{K=1536,N=2048,expert_num=64,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=false}_NVIDIA_H200.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_H200/grouped_matmul:v1/{K=1536,N=2048,expert_num=64,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=false}_NVIDIA_H200.json new file mode 100644 index 000000000..deb97363c --- /dev/null +++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_H200/grouped_matmul:v1/{K=1536,N=2048,expert_num=64,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=false}_NVIDIA_H200.json @@ -0,0 +1,110 @@ +{ + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "NEED_TRANS": false, + "num_stages": 2, + "num_warps": 8 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "NEED_TRANS": false, + "num_stages": 3, + "num_warps": 4 + }, + "16384": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "NEED_TRANS": false, + "num_stages": 3, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 16, + "NEED_TRANS": false, + "num_stages": 3, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + 
"GROUP_SIZE_M": 1, + "NEED_TRANS": false, + "num_stages": 4, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "NEED_TRANS": false, + "num_stages": 3, + "num_warps": 4 + }, + "400": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "NEED_TRANS": false, + "num_stages": 3, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "NEED_TRANS": false, + "num_stages": 3, + "num_warps": 8 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "NEED_TRANS": false, + "num_stages": 3, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 16, + "NEED_TRANS": false, + "num_stages": 4, + "num_warps": 4 + }, + "65536": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "NEED_TRANS": false, + "num_stages": 3, + "num_warps": 4 + }, + "8192": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "NEED_TRANS": false, + "num_stages": 3, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_H200/grouped_matmul:v1/{K=2048,N=3072,expert_num=64,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=4,use_fp8_w8a8=false}_NVIDIA_H200.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_H200/grouped_matmul:v1/{K=2048,N=3072,expert_num=64,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=4,use_fp8_w8a8=false}_NVIDIA_H200.json new file mode 100644 index 000000000..a6c93c3f6 --- /dev/null +++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_H200/grouped_matmul:v1/{K=2048,N=3072,expert_num=64,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=4,use_fp8_w8a8=false}_NVIDIA_H200.json @@ -0,0 +1,110 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "NEED_TRANS": false, + "num_stages": 3, + "num_warps": 4 + }, + "100": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "NEED_TRANS": false, + "num_stages": 2, + "num_warps": 8 + }, + "1024": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "NEED_TRANS": false, + "num_stages": 3, + "num_warps": 8 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "NEED_TRANS": false, + "num_stages": 2, + "num_warps": 8 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "NEED_TRANS": false, + "num_stages": 3, + "num_warps": 4 + }, + "16384": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "NEED_TRANS": false, + "num_stages": 3, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "NEED_TRANS": false, + "num_stages": 3, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "NEED_TRANS": false, + "num_stages": 2, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "NEED_TRANS": false, + "num_stages": 3, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 64, + 
"BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "NEED_TRANS": false, + "num_stages": 3, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "NEED_TRANS": false, + "num_stages": 2, + "num_warps": 8 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "NEED_TRANS": false, + "num_stages": 3, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_H200/moe_align_fused:v1/{topk_num=4}_NVIDIA_H200.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_H200/moe_align_fused:v1/{topk_num=4}_NVIDIA_H200.json new file mode 100644 index 000000000..0f0c175b9 --- /dev/null +++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_H200/moe_align_fused:v1/{topk_num=4}_NVIDIA_H200.json @@ -0,0 +1,50 @@ +{ + "1": { + "BLOCK_SIZE": 128, + "num_warps": 8 + }, + "100": { + "BLOCK_SIZE": 128, + "num_warps": 8 + }, + "1024": { + "BLOCK_SIZE": 256, + "num_warps": 8 + }, + "128": { + "BLOCK_SIZE": 128, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE": 128, + "num_warps": 2 + }, + "16384": { + "BLOCK_SIZE": 256, + "num_warps": 8 + }, + "2048": { + "BLOCK_SIZE": 128, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE": 128, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE": 128, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE": 128, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE": 128, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE": 128, + "num_warps": 8 + } +} \ No newline at end of file diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_H200/moe_sum_reduce:v1/{hidden_dim=2048,out_dtype=torch.bfloat16,topk_num=4}_NVIDIA_H200.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_H200/moe_sum_reduce:v1/{hidden_dim=2048,out_dtype=torch.bfloat16,topk_num=4}_NVIDIA_H200.json new file mode 100644 index 000000000..c6c3d54ff --- /dev/null +++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_H200/moe_sum_reduce:v1/{hidden_dim=2048,out_dtype=torch.bfloat16,topk_num=4}_NVIDIA_H200.json @@ -0,0 +1,74 @@ +{ + "1": { + "BLOCK_DIM": 64, + "BLOCK_M": 1, + "NUM_STAGE": 4, + "num_warps": 4 + }, + "100": { + "BLOCK_DIM": 1024, + "BLOCK_M": 1, + "NUM_STAGE": 1, + "num_warps": 8 + }, + "1024": { + "BLOCK_DIM": 1024, + "BLOCK_M": 1, + "NUM_STAGE": 1, + "num_warps": 8 + }, + "128": { + "BLOCK_DIM": 512, + "BLOCK_M": 1, + "NUM_STAGE": 4, + "num_warps": 1 + }, + "16": { + "BLOCK_DIM": 256, + "BLOCK_M": 1, + "NUM_STAGE": 1, + "num_warps": 8 + }, + "16384": { + "BLOCK_DIM": 1024, + "BLOCK_M": 1, + "NUM_STAGE": 4, + "num_warps": 4 + }, + "2048": { + "BLOCK_DIM": 1024, + "BLOCK_M": 1, + "NUM_STAGE": 1, + "num_warps": 1 + }, + "256": { + "BLOCK_DIM": 512, + "BLOCK_M": 1, + "NUM_STAGE": 1, + "num_warps": 4 + }, + "32": { + "BLOCK_DIM": 256, + "BLOCK_M": 1, + "NUM_STAGE": 1, + "num_warps": 2 + }, + "4096": { + "BLOCK_DIM": 1024, + "BLOCK_M": 1, + "NUM_STAGE": 1, + "num_warps": 8 + }, + "64": { + "BLOCK_DIM": 512, + "BLOCK_M": 1, + "NUM_STAGE": 4, + "num_warps": 16 + }, + "8": { + "BLOCK_DIM": 64, + "BLOCK_M": 1, + "NUM_STAGE": 2, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_H200/rotary_emb_fwd:v1/{HEAD_DIM=64,K_HEAD_NUM=1,Q_HEAD_NUM=20,dtype=torch.bfloat16}_NVIDIA_H200.json 
b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_H200/rotary_emb_fwd:v1/{HEAD_DIM=64,K_HEAD_NUM=1,Q_HEAD_NUM=20,dtype=torch.bfloat16}_NVIDIA_H200.json new file mode 100644 index 000000000..5601eab76 --- /dev/null +++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_H200/rotary_emb_fwd:v1/{HEAD_DIM=64,K_HEAD_NUM=1,Q_HEAD_NUM=20,dtype=torch.bfloat16}_NVIDIA_H200.json @@ -0,0 +1,74 @@ +{ + "1": { + "BLOCK_SEQ": 32, + "HEAD_PARALLEL_NUM": 16, + "num_stages": 5, + "num_warps": 8 + }, + "100": { + "BLOCK_SEQ": 1, + "HEAD_PARALLEL_NUM": 16, + "num_stages": 3, + "num_warps": 1 + }, + "1024": { + "BLOCK_SEQ": 1, + "HEAD_PARALLEL_NUM": 2, + "num_stages": 5, + "num_warps": 1 + }, + "128": { + "BLOCK_SEQ": 1, + "HEAD_PARALLEL_NUM": 8, + "num_stages": 5, + "num_warps": 1 + }, + "16": { + "BLOCK_SEQ": 1, + "HEAD_PARALLEL_NUM": 16, + "num_stages": 3, + "num_warps": 2 + }, + "16384": { + "BLOCK_SEQ": 8, + "HEAD_PARALLEL_NUM": 2, + "num_stages": 5, + "num_warps": 1 + }, + "2048": { + "BLOCK_SEQ": 1, + "HEAD_PARALLEL_NUM": 2, + "num_stages": 5, + "num_warps": 1 + }, + "256": { + "BLOCK_SEQ": 1, + "HEAD_PARALLEL_NUM": 8, + "num_stages": 1, + "num_warps": 1 + }, + "32": { + "BLOCK_SEQ": 1, + "HEAD_PARALLEL_NUM": 16, + "num_stages": 4, + "num_warps": 4 + }, + "4096": { + "BLOCK_SEQ": 2, + "HEAD_PARALLEL_NUM": 2, + "num_stages": 3, + "num_warps": 1 + }, + "64": { + "BLOCK_SEQ": 1, + "HEAD_PARALLEL_NUM": 16, + "num_stages": 1, + "num_warps": 1 + }, + "8": { + "BLOCK_SEQ": 1, + "HEAD_PARALLEL_NUM": 16, + "num_stages": 4, + "num_warps": 1 + } +} \ No newline at end of file diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_H200/silu_and_mul_fwd:v1/{N=10240,out_dtype=torch.bfloat16}_NVIDIA_H200.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_H200/silu_and_mul_fwd:v1/{N=10240,out_dtype=torch.bfloat16}_NVIDIA_H200.json new file mode 100644 index 000000000..b82f25e17 --- /dev/null +++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_H200/silu_and_mul_fwd:v1/{N=10240,out_dtype=torch.bfloat16}_NVIDIA_H200.json @@ -0,0 +1,74 @@ +{ + "1": { + "BLOCK_M": 1, + "BLOCK_N": 256, + "NUM_STAGES": 4, + "num_warps": 1 + }, + "100": { + "BLOCK_M": 8, + "BLOCK_N": 256, + "NUM_STAGES": 4, + "num_warps": 1 + }, + "1024": { + "BLOCK_M": 8, + "BLOCK_N": 128, + "NUM_STAGES": 4, + "num_warps": 1 + }, + "128": { + "BLOCK_M": 8, + "BLOCK_N": 128, + "NUM_STAGES": 4, + "num_warps": 1 + }, + "16": { + "BLOCK_M": 1, + "BLOCK_N": 256, + "NUM_STAGES": 1, + "num_warps": 1 + }, + "16384": { + "BLOCK_M": 8, + "BLOCK_N": 128, + "NUM_STAGES": 4, + "num_warps": 1 + }, + "2048": { + "BLOCK_M": 32, + "BLOCK_N": 256, + "NUM_STAGES": 4, + "num_warps": 1 + }, + "256": { + "BLOCK_M": 8, + "BLOCK_N": 256, + "NUM_STAGES": 4, + "num_warps": 1 + }, + "32": { + "BLOCK_M": 1, + "BLOCK_N": 256, + "NUM_STAGES": 2, + "num_warps": 1 + }, + "4096": { + "BLOCK_M": 8, + "BLOCK_N": 128, + "NUM_STAGES": 4, + "num_warps": 1 + }, + "64": { + "BLOCK_M": 8, + "BLOCK_N": 128, + "NUM_STAGES": 4, + "num_warps": 1 + }, + "8": { + "BLOCK_M": 1, + "BLOCK_N": 256, + "NUM_STAGES": 4, + "num_warps": 1 + } +} \ No newline at end of file diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_H200/silu_and_mul_fwd:v1/{N=1536,out_dtype=torch.bfloat16}_NVIDIA_H200.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_H200/silu_and_mul_fwd:v1/{N=1536,out_dtype=torch.bfloat16}_NVIDIA_H200.json 
new file mode 100644 index 000000000..ab4644621 --- /dev/null +++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_H200/silu_and_mul_fwd:v1/{N=1536,out_dtype=torch.bfloat16}_NVIDIA_H200.json @@ -0,0 +1,104 @@ +{ + "1": { + "BLOCK_M": 32, + "BLOCK_N": 128, + "NUM_STAGES": 2, + "num_warps": 4 + }, + "100": { + "BLOCK_M": 1, + "BLOCK_N": 256, + "NUM_STAGES": 2, + "num_warps": 1 + }, + "1024": { + "BLOCK_M": 8, + "BLOCK_N": 256, + "NUM_STAGES": 1, + "num_warps": 4 + }, + "128": { + "BLOCK_M": 1, + "BLOCK_N": 256, + "NUM_STAGES": 1, + "num_warps": 4 + }, + "16": { + "BLOCK_M": 1, + "BLOCK_N": 128, + "NUM_STAGES": 4, + "num_warps": 4 + }, + "16384": { + "BLOCK_M": 1, + "BLOCK_N": 256, + "NUM_STAGES": 4, + "num_warps": 1 + }, + "2048": { + "BLOCK_M": 8, + "BLOCK_N": 256, + "NUM_STAGES": 4, + "num_warps": 1 + }, + "256": { + "BLOCK_M": 1, + "BLOCK_N": 256, + "NUM_STAGES": 2, + "num_warps": 4 + }, + "32": { + "BLOCK_M": 1, + "BLOCK_N": 256, + "NUM_STAGES": 4, + "num_warps": 8 + }, + "4": { + "BLOCK_M": 1, + "BLOCK_N": 32, + "NUM_STAGES": 1, + "num_warps": 4 + }, + "400": { + "BLOCK_M": 1, + "BLOCK_N": 256, + "NUM_STAGES": 1, + "num_warps": 4 + }, + "4096": { + "BLOCK_M": 8, + "BLOCK_N": 256, + "NUM_STAGES": 4, + "num_warps": 1 + }, + "512": { + "BLOCK_M": 8, + "BLOCK_N": 256, + "NUM_STAGES": 4, + "num_warps": 4 + }, + "64": { + "BLOCK_M": 1, + "BLOCK_N": 256, + "NUM_STAGES": 1, + "num_warps": 1 + }, + "65536": { + "BLOCK_M": 1, + "BLOCK_N": 128, + "NUM_STAGES": 4, + "num_warps": 1 + }, + "8": { + "BLOCK_M": 1, + "BLOCK_N": 64, + "NUM_STAGES": 1, + "num_warps": 4 + }, + "8192": { + "BLOCK_M": 8, + "BLOCK_N": 256, + "NUM_STAGES": 4, + "num_warps": 1 + } +} \ No newline at end of file diff --git a/lightllm/models/__init__.py b/lightllm/models/__init__.py index 539b32dec..095f73679 100644 --- a/lightllm/models/__init__.py +++ b/lightllm/models/__init__.py @@ -18,6 +18,7 @@ from lightllm.models.gemma_2b.model import Gemma_2bTpPartModel from lightllm.models.phi3.model import Phi3TpPartModel from lightllm.models.deepseek2.model import Deepseek2TpPartModel +from lightllm.models.glm4_moe_lite.model import Glm4MoeLiteTpPartModel from lightllm.models.internvl.model import ( InternVLLlamaTpPartModel, InternVLPhi3TpPartModel, diff --git a/lightllm/models/deepseek2/model.py b/lightllm/models/deepseek2/model.py index f0739a8a8..c7f13bb63 100644 --- a/lightllm/models/deepseek2/model.py +++ b/lightllm/models/deepseek2/model.py @@ -43,6 +43,8 @@ def _init_some_value(self): self.qk_rope_head_dim = self.config["qk_rope_head_dim"] self.q_lora_rank = self.config["q_lora_rank"] self.kv_lora_rank = self.config["kv_lora_rank"] + # v_head_dim defaults to qk_nope_head_dim for DeepSeek-V2, but GLM-4.7-Flash has different value + self.v_head_dim = self.config.get("v_head_dim", self.qk_nope_head_dim) self.head_dim_ = self.kv_lora_rank + self.qk_rope_head_dim def _init_custom(self): diff --git a/lightllm/models/glm4_moe_lite/__init__.py b/lightllm/models/glm4_moe_lite/__init__.py new file mode 100644 index 000000000..b00657090 --- /dev/null +++ b/lightllm/models/glm4_moe_lite/__init__.py @@ -0,0 +1,4 @@ +from lightllm.models.glm4_moe_lite.model import Glm4MoeLiteTpPartModel +from lightllm.models.glm4_moe_lite.infer_struct import Glm4MoeLiteInferStateInfo + +__all__ = ["Glm4MoeLiteTpPartModel", "Glm4MoeLiteInferStateInfo"] diff --git a/lightllm/models/glm4_moe_lite/infer_struct.py b/lightllm/models/glm4_moe_lite/infer_struct.py new file mode 100644 index 000000000..92c350abc --- /dev/null +++ 
b/lightllm/models/glm4_moe_lite/infer_struct.py @@ -0,0 +1,12 @@ +from lightllm.models.deepseek2.infer_struct import Deepseek2InferStateInfo + + +class Glm4MoeLiteInferStateInfo(Deepseek2InferStateInfo): + """Inference state for GLM-4.7-Flash (glm4_moe_lite architecture). + + Inherits from Deepseek2InferStateInfo as GLM-4.7-Flash uses the same + MLA (Multi-Head Latent Attention) mechanism as DeepSeek-V2/V3. + """ + + def __init__(self): + super().__init__() diff --git a/lightllm/models/glm4_moe_lite/layer_infer/__init__.py b/lightllm/models/glm4_moe_lite/layer_infer/__init__.py new file mode 100644 index 000000000..a95580535 --- /dev/null +++ b/lightllm/models/glm4_moe_lite/layer_infer/__init__.py @@ -0,0 +1,3 @@ +from lightllm.models.glm4_moe_lite.layer_infer.transformer_layer_infer import Glm4MoeLiteTransformerLayerInfer + +__all__ = ["Glm4MoeLiteTransformerLayerInfer"] diff --git a/lightllm/models/glm4_moe_lite/layer_infer/transformer_layer_infer.py b/lightllm/models/glm4_moe_lite/layer_infer/transformer_layer_infer.py new file mode 100644 index 000000000..bcea872fd --- /dev/null +++ b/lightllm/models/glm4_moe_lite/layer_infer/transformer_layer_infer.py @@ -0,0 +1,96 @@ +import os +import torch +import torch.distributed as dist +import triton +from functools import partial +from lightllm.models.deepseek2.layer_infer.transformer_layer_infer import Deepseek2TransformerLayerInfer +from lightllm.models.llama.layer_infer.transformer_layer_infer import LlamaTransformerLayerInfer +from lightllm.distributed.communication_op import reduce_scatter_tensor + + +class Glm4MoeLiteTransformerLayerInfer(Deepseek2TransformerLayerInfer): + def __init__(self, layer_num, network_config): + self._glm4_layer_num = layer_num + self._glm4_first_k_dense = network_config.get("first_k_dense_replace", 0) + self._glm4_has_routed_experts = network_config.get("n_routed_experts") is not None + super().__init__(layer_num, network_config) + + @property + def is_moe(self): + return self._glm4_has_routed_experts and self._glm4_layer_num >= self._glm4_first_k_dense + + @is_moe.setter + def is_moe(self, value): + pass + + def _bind_ffn(self): + if self.is_moe: + moe_mode = os.environ.get("MOE_MODE", "TP") + if moe_mode == "EP": + self._ffn = partial(Deepseek2TransformerLayerInfer._moe_ffn_edp, self) + self._tpsp_ffn = self._tpsp_ffn_ep + else: + self._ffn = partial(Glm4MoeLiteTransformerLayerInfer._moe_ffn, self) + self._tpsp_ffn = self._tpsp_ffn_tp + else: + self._ffn = partial(LlamaTransformerLayerInfer._ffn, self) + self._tpsp_ffn = self._tpsp_ffn_tp + + def _get_o(self, input: torch.Tensor, infer_state, layer_weight) -> torch.Tensor: + if input.shape[2] == self.kv_lora_rank: + input = layer_weight.v_b_proj_.bmm(input.transpose(0, 1)).transpose(0, 1) + o_tensor = layer_weight.o_weight_.mm(input.reshape(-1, self.tp_q_head_num_ * self.v_head_dim)) + return o_tensor + + def _tpsp_get_o(self, input, infer_state, layer_weight) -> torch.Tensor: + if infer_state.need_dp_prefill_balance: + input = infer_state._all_to_all_balance_get(data=input) + + if input.shape[2] == self.kv_lora_rank: + input = layer_weight.v_b_proj_.bmm(input.transpose(0, 1)).transpose(0, 1) + + input = input.reshape(-1, self.tp_q_head_num_ * self.v_head_dim) + dest_size = triton.cdiv(input.shape[0], self.tp_world_size_) * self.tp_world_size_ + o_tensor = self.alloc_tensor((dest_size, self.embed_dim_), dtype=input.dtype, device=input.device) + layer_weight.o_weight_.mm(input, out=o_tensor[0 : len(infer_state.input_ids), :]) + e_o_tensor = 
o_tensor[len(infer_state.input_ids) :, :] + if e_o_tensor.shape[0] > 0: + e_o_tensor.fill_(0) + + if self.tp_world_size_ > 1: + sp_token_num = o_tensor.shape[0] // self.tp_world_size_ + reduce_o_tensor = self.alloc_tensor((sp_token_num, self.embed_dim_), dtype=input.dtype, device=input.device) + reduce_scatter_tensor( + output=reduce_o_tensor, + input=o_tensor, + op=dist.ReduceOp.SUM, + group=infer_state.dist_group, + async_op=False, + ) + o_tensor = reduce_o_tensor + + return o_tensor + + def _moe_ffn(self, input, infer_state, layer_weight): + hidden_states = input.view(-1, self.embed_dim_) + num_tokens, hidden_dim = hidden_states.shape + + if self.n_shared_experts is not None and layer_weight.num_fused_shared_experts == 0: + shared_output = LlamaTransformerLayerInfer._ffn(self, hidden_states, infer_state, layer_weight) + + router_logits = layer_weight.moe_gate.mm(hidden_states.to(torch.float32)) + + layer_weight.experts.experts( + hidden_states, + router_logits=router_logits, + top_k=self.num_experts_per_tok, + renormalize=self.norm_topk_prob, + use_grouped_topk=self.n_group, + topk_group=self.topk_group, + num_expert_group=self.n_group, + ) + + if self.n_shared_experts is not None and layer_weight.num_fused_shared_experts == 0: + hidden_states.add_(shared_output) + + return hidden_states.view(num_tokens, hidden_dim) diff --git a/lightllm/models/glm4_moe_lite/layer_weights/__init__.py b/lightllm/models/glm4_moe_lite/layer_weights/__init__.py new file mode 100644 index 000000000..1fd5e36f8 --- /dev/null +++ b/lightllm/models/glm4_moe_lite/layer_weights/__init__.py @@ -0,0 +1,3 @@ +from lightllm.models.glm4_moe_lite.layer_weights.transformer_layer_weight import Glm4MoeLiteTransformerLayerWeight + +__all__ = ["Glm4MoeLiteTransformerLayerWeight"] diff --git a/lightllm/models/glm4_moe_lite/layer_weights/transformer_layer_weight.py b/lightllm/models/glm4_moe_lite/layer_weights/transformer_layer_weight.py new file mode 100644 index 000000000..b4d41fd47 --- /dev/null +++ b/lightllm/models/glm4_moe_lite/layer_weights/transformer_layer_weight.py @@ -0,0 +1,113 @@ +import os +import torch +from lightllm.models.deepseek2.layer_weights.transformer_layer_weight import Deepseek2TransformerLayerWeight +from lightllm.common.basemodel.layer_weights.meta_weights import ROWMMWeight +from lightllm.common.basemodel.layer_weights.meta_weights.fused_moe_weight_tp import create_tp_moe_wegiht_obj +from lightllm.common.basemodel.layer_weights.meta_weights.fused_moe_weight_ep import FusedMoeWeightEP + + +class Glm4MoeLiteTransformerLayerWeight(Deepseek2TransformerLayerWeight): + def __init__(self, layer_num, data_type, network_config, quant_cfg=None): + super().__init__(layer_num, data_type, network_config, quant_cfg) + + def _parse_config(self): + from lightllm.common.basemodel import TransformerLayerWeight + + TransformerLayerWeight._parse_config(self) + + self.is_moe = self.network_config_.get( + "n_routed_experts" + ) is not None and self.layer_num_ >= self.network_config_.get("first_k_dense_replace", 0) + + self.tp_q_head_num_ = self.network_config_["num_attention_heads"] + self.tp_q_head_num_ = self.tp_q_head_num_ // self.tp_world_size_ + self.n_routed_experts = self.network_config_.get("n_routed_experts") + self.q_lora_rank = self.network_config_.get("q_lora_rank") + self.qk_nope_head_dim = self.network_config_["qk_nope_head_dim"] + self.qk_rope_head_dim = self.network_config_["qk_rope_head_dim"] + self.v_head_dim = self.network_config_["v_head_dim"] + self.num_attention_heads = 
self.network_config_["num_attention_heads"] + self.kv_lora_rank = self.network_config_["kv_lora_rank"] + + from lightllm.utils.envs_utils import get_env_start_args + import os + + self.num_fused_shared_experts = 0 + if get_env_start_args().enable_fused_shared_experts and self.is_moe: + moe_mode = os.getenv("MOE_MODE", "TP") + assert moe_mode == "TP" + self.num_fused_shared_experts = self.network_config_.get("n_shared_experts", 0) + + def _load_kb(self, kv_b_proj_): + kv_dim = self.qk_nope_head_dim + self.v_head_dim + k_b_proj_ = kv_b_proj_.view(self.num_attention_heads, kv_dim, self.kv_lora_rank)[:, : self.qk_nope_head_dim, :] + return k_b_proj_.contiguous().to(kv_b_proj_.dtype) + + def _load_kb_scale(self, kv_b_proj_, block_size): + kv_dim = self.qk_nope_head_dim + self.v_head_dim + k_b_proj_scale_ = kv_b_proj_.view( + self.num_attention_heads, kv_dim // block_size, self.kv_lora_rank // block_size + )[:, : self.qk_nope_head_dim // block_size, :] + return k_b_proj_scale_.contiguous().to(kv_b_proj_.dtype) + + def _load_vb(self, kv_b_proj_): + kv_dim = self.qk_nope_head_dim + self.v_head_dim + v_b_proj_ = kv_b_proj_.T.view(self.kv_lora_rank, self.num_attention_heads, kv_dim)[ + :, :, self.qk_nope_head_dim : + ].transpose(0, 1) + return v_b_proj_.contiguous().to(kv_b_proj_.dtype) + + def _load_vb_scale(self, kv_b_proj_scale_, block_size): + kv_dim = self.qk_nope_head_dim + self.v_head_dim + v_b_proj_scale_ = kv_b_proj_scale_.T.view( + self.kv_lora_rank // block_size, + self.num_attention_heads, + kv_dim // block_size, + )[:, :, self.qk_nope_head_dim // block_size :].transpose(0, 1) + return v_b_proj_scale_.contiguous().to(kv_b_proj_scale_.dtype) + + def _init_moe(self): + moe_intermediate_size = self.network_config_["moe_intermediate_size"] + + self.moe_gate = ROWMMWeight( + weight_names=f"model.layers.{self.layer_num_}.mlp.gate.weight", + data_type=torch.float32, + layer_num=self.layer_num_, + name="moe_gate", + tp_rank=0, + tp_world_size=1, + ) + + if self.num_fused_shared_experts == 0: + self._load_mlp(f"model.layers.{self.layer_num_}.mlp.shared_experts") + + moe_mode = os.getenv("MOE_MODE", "TP") + assert moe_mode in ["EP", "TP"] + if moe_mode == "TP": + self.experts = create_tp_moe_wegiht_obj( + gate_proj_name="gate_proj", + down_proj_name="down_proj", + up_proj_name="up_proj", + e_score_correction_bias_name=self.e_score_correction_bias_name, + weight_prefix=f"model.layers.{self.layer_num_}.mlp.experts", + n_routed_experts=self.n_routed_experts, + num_fused_shared_experts=self.num_fused_shared_experts, + split_inter_size=moe_intermediate_size // self.tp_world_size_, + data_type=self.data_type_, + network_config=self.network_config_, + layer_num=self.layer_num_, + quant_cfg=self.quant_cfg, + ) + else: + self.experts = FusedMoeWeightEP( + gate_proj_name="gate_proj", + down_proj_name="down_proj", + up_proj_name="up_proj", + e_score_correction_bias_name=self.e_score_correction_bias_name, + weight_prefix=f"model.layers.{self.layer_num_}.mlp.experts", + n_routed_experts=self.n_routed_experts, + data_type=self.data_type_, + network_config=self.network_config_, + layer_num=self.layer_num_, + quant_cfg=self.quant_cfg, + ) diff --git a/lightllm/models/glm4_moe_lite/model.py b/lightllm/models/glm4_moe_lite/model.py new file mode 100644 index 000000000..a5970ab59 --- /dev/null +++ b/lightllm/models/glm4_moe_lite/model.py @@ -0,0 +1,74 @@ +import torch +from lightllm.models.registry import ModelRegistry +from lightllm.models.deepseek2.model import Deepseek2TpPartModel +from 
lightllm.models.glm4_moe_lite.layer_infer.transformer_layer_infer import Glm4MoeLiteTransformerLayerInfer +from lightllm.models.glm4_moe_lite.layer_weights.transformer_layer_weight import Glm4MoeLiteTransformerLayerWeight +from lightllm.models.glm4_moe_lite.infer_struct import Glm4MoeLiteInferStateInfo +from lightllm.distributed.communication_op import dist_group_manager +from lightllm.utils.log_utils import init_logger + +logger = init_logger(__name__) + + +@ModelRegistry("glm4_moe_lite") +class Glm4MoeLiteTpPartModel(Deepseek2TpPartModel): + + transformer_weight_class = Glm4MoeLiteTransformerLayerWeight + transformer_layer_infer_class = Glm4MoeLiteTransformerLayerInfer + infer_state_class = Glm4MoeLiteInferStateInfo + + def __init__(self, kvargs): + super().__init__(kvargs) + + def _init_config(self): + super()._init_config() + + if "moe_layer_freq" not in self.config and self.config.get("n_routed_experts"): + self.config["moe_layer_freq"] = 1 + + if "routed_scaling_factor" not in self.config: + self.config["routed_scaling_factor"] = 1.8 + + if "topk_method" not in self.config: + self.config["topk_method"] = "noaux_tc" + + if "scoring_func" not in self.config: + self.config["scoring_func"] = "sigmoid" + + logger.info( + f"GLM-4.7-Flash config: " + f"n_routed_experts={self.config.get('n_routed_experts')}, " + f"n_shared_experts={self.config.get('n_shared_experts')}, " + f"num_experts_per_tok={self.config.get('num_experts_per_tok')}, " + f"first_k_dense_replace={self.config.get('first_k_dense_replace')}, " + f"routed_scaling_factor={self.config.get('routed_scaling_factor')}, " + f"scoring_func={self.config.get('scoring_func')}" + ) + + def _init_custom(self): + self._init_to_get_yarn_rotary() + dist_group_manager.new_deepep_group(self.config["n_routed_experts"], self.config["hidden_size"]) + + def _init_to_get_yarn_rotary(self): + rope_scaling = self.config.get("rope_scaling") + + if rope_scaling is None: + self._init_glm4_standard_rotary() + else: + super()._init_to_get_yarn_rotary() + + def _init_glm4_standard_rotary(self): + rope_theta = self.config.get("rope_theta", 1000000.0) + qk_rope_head_dim = self.config.get("qk_rope_head_dim", 64) + max_position_embeddings = self.config.get("max_position_embeddings", 202752) + + dim = qk_rope_head_dim + + inv_freq = 1.0 / (rope_theta ** (torch.arange(0, dim, 2, device="cpu", dtype=torch.float32) / dim)) + + max_seq_len = max(max_position_embeddings, self.max_seq_length) + t = torch.arange(max_seq_len, device="cpu", dtype=torch.float32) + freqs = torch.outer(t, inv_freq) + + self._cos_cached = torch.cos(freqs).to(self.data_type).cuda() + self._sin_cached = torch.sin(freqs).to(self.data_type).cuda() diff --git a/lightllm/models/glm4_moe_lite_mtp/__init__.py b/lightllm/models/glm4_moe_lite_mtp/__init__.py new file mode 100644 index 000000000..96b6659c8 --- /dev/null +++ b/lightllm/models/glm4_moe_lite_mtp/__init__.py @@ -0,0 +1,3 @@ +from lightllm.models.glm4_moe_lite_mtp.model import Glm4MoeLiteMTPModel + +__all__ = ["Glm4MoeLiteMTPModel"] diff --git a/lightllm/models/glm4_moe_lite_mtp/layer_infer/__init__.py b/lightllm/models/glm4_moe_lite_mtp/layer_infer/__init__.py new file mode 100644 index 000000000..e357bfa19 --- /dev/null +++ b/lightllm/models/glm4_moe_lite_mtp/layer_infer/__init__.py @@ -0,0 +1,3 @@ +from lightllm.models.glm4_moe_lite_mtp.layer_infer.pre_layer_infer import Glm4MoeLiteMTPPreLayerInfer + +__all__ = ["Glm4MoeLiteMTPPreLayerInfer"] diff --git a/lightllm/models/glm4_moe_lite_mtp/layer_infer/pre_layer_infer.py 
b/lightllm/models/glm4_moe_lite_mtp/layer_infer/pre_layer_infer.py new file mode 100644 index 000000000..6994d21f7 --- /dev/null +++ b/lightllm/models/glm4_moe_lite_mtp/layer_infer/pre_layer_infer.py @@ -0,0 +1,82 @@ +import torch + +from lightllm.models.glm4_moe_lite_mtp.layer_weights.pre_and_post_layer_weight import ( + Glm4MoeLiteMTPPreAndPostLayerWeight, +) +from lightllm.models.glm4_moe_lite.infer_struct import Glm4MoeLiteInferStateInfo +from lightllm.models.llama.layer_infer.pre_layer_infer import LlamaPreLayerInfer + + +class Glm4MoeLiteMTPPreLayerInfer(LlamaPreLayerInfer): + def __init__(self, network_config): + super().__init__(network_config) + self.eps_ = network_config["rms_norm_eps"] + self.hidden_size = network_config["hidden_size"] + + def _mtp_context_forward( + self, + input_embdings, + infer_state: Glm4MoeLiteInferStateInfo, + layer_weight: Glm4MoeLiteMTPPreAndPostLayerWeight, + ): + tgt_embdings = infer_state.mtp_draft_input_hiddens + assert ( + input_embdings.shape[0] == tgt_embdings.shape[0] + ), f"shape {input_embdings.shape} != shape {tgt_embdings.shape}" + + layer_weight.enorm_weight_.rmsnorm_forward( + input=input_embdings, + eps=self.eps_, + out=input_embdings, + ) + layer_weight.hnorm_weight_.rmsnorm_forward( + input=tgt_embdings, + eps=self.eps_, + out=tgt_embdings, + ) + cat_embdings = torch.cat((input_embdings, tgt_embdings), dim=-1) + + ans_logics = layer_weight.eh_proj_weight_.mm(cat_embdings) + return ans_logics + + def _mtp_token_forward( + self, + input_embdings, + infer_state: Glm4MoeLiteInferStateInfo, + layer_weight: Glm4MoeLiteMTPPreAndPostLayerWeight, + ): + tgt_embdings = infer_state.mtp_draft_input_hiddens + assert input_embdings.shape[0] == tgt_embdings.shape[0] + + layer_weight.enorm_weight_.rmsnorm_forward( + input=input_embdings, + eps=self.eps_, + out=input_embdings, + ) + layer_weight.hnorm_weight_.rmsnorm_forward( + input=tgt_embdings, + eps=self.eps_, + out=tgt_embdings, + ) + cat_embdings = torch.cat((input_embdings, tgt_embdings), dim=-1) + + ans_logics = layer_weight.eh_proj_weight_.mm(cat_embdings) + return ans_logics + + def context_forward( + self, + input_ids, + infer_state: Glm4MoeLiteInferStateInfo, + layer_weight: Glm4MoeLiteMTPPreAndPostLayerWeight, + ): + input_embdings = super().context_forward(input_ids, infer_state, layer_weight) + return self._mtp_context_forward(input_embdings, infer_state, layer_weight) + + def token_forward( + self, + input_ids, + infer_state: Glm4MoeLiteInferStateInfo, + layer_weight: Glm4MoeLiteMTPPreAndPostLayerWeight, + ): + input_embdings = super().token_forward(input_ids, infer_state, layer_weight) + return self._mtp_token_forward(input_embdings, infer_state, layer_weight) diff --git a/lightllm/models/glm4_moe_lite_mtp/layer_weights/__init__.py b/lightllm/models/glm4_moe_lite_mtp/layer_weights/__init__.py new file mode 100644 index 000000000..57fe578cf --- /dev/null +++ b/lightllm/models/glm4_moe_lite_mtp/layer_weights/__init__.py @@ -0,0 +1,5 @@ +from lightllm.models.glm4_moe_lite_mtp.layer_weights.pre_and_post_layer_weight import ( + Glm4MoeLiteMTPPreAndPostLayerWeight, +) + +__all__ = ["Glm4MoeLiteMTPPreAndPostLayerWeight"] diff --git a/lightllm/models/glm4_moe_lite_mtp/layer_weights/pre_and_post_layer_weight.py b/lightllm/models/glm4_moe_lite_mtp/layer_weights/pre_and_post_layer_weight.py new file mode 100644 index 000000000..a84ada72f --- /dev/null +++ b/lightllm/models/glm4_moe_lite_mtp/layer_weights/pre_and_post_layer_weight.py @@ -0,0 +1,43 @@ +from lightllm.common.basemodel import 
PreAndPostLayerWeight +from lightllm.common.basemodel.layer_weights.meta_weights import ( + EmbeddingWeight, + LMHeadWeight, + NoTpNormWeight, + ROWMMWeight, +) + + +class Glm4MoeLiteMTPPreAndPostLayerWeight(PreAndPostLayerWeight): + def __init__(self, data_type, network_config): + super().__init__(data_type, network_config) + + mtp_layer_idx = network_config["num_hidden_layers"] + + self.eh_proj_weight_ = ROWMMWeight( + weight_names=f"model.layers.{mtp_layer_idx}.eh_proj.weight", + data_type=self.data_type_, + name="eh_proj", + tp_rank=0, + tp_world_size=1, + ) + + self.enorm_weight_ = NoTpNormWeight( + weight_name=f"model.layers.{mtp_layer_idx}.enorm.weight", + data_type=self.data_type_, + bias_name=None, + ) + + self.hnorm_weight_ = NoTpNormWeight( + weight_name=f"model.layers.{mtp_layer_idx}.hnorm.weight", + data_type=self.data_type_, + bias_name=None, + ) + + self.final_norm_weight_ = NoTpNormWeight( + weight_name=f"model.layers.{mtp_layer_idx}.shared_head.norm.weight", + data_type=self.data_type_, + bias_name=None, + ) + + self.wte_weight_: EmbeddingWeight = None + self.lm_head_weight_: LMHeadWeight = None diff --git a/lightllm/models/glm4_moe_lite_mtp/model.py b/lightllm/models/glm4_moe_lite_mtp/model.py new file mode 100644 index 000000000..1e7e68e22 --- /dev/null +++ b/lightllm/models/glm4_moe_lite_mtp/model.py @@ -0,0 +1,88 @@ +from typing import List +from lightllm.models.glm4_moe_lite.model import Glm4MoeLiteTpPartModel +from lightllm.models.glm4_moe_lite_mtp.layer_infer.pre_layer_infer import Glm4MoeLiteMTPPreLayerInfer +from lightllm.models.glm4_moe_lite_mtp.layer_weights.pre_and_post_layer_weight import ( + Glm4MoeLiteMTPPreAndPostLayerWeight, +) +from lightllm.common.basemodel import TpPartBaseModel +from lightllm.common.basemodel.basemodel import load_hf_weights + + +class Glm4MoeLiteMTPModel(Glm4MoeLiteTpPartModel): + + pre_and_post_weight_class = Glm4MoeLiteMTPPreAndPostLayerWeight + pre_layer_infer_class = Glm4MoeLiteMTPPreLayerInfer + + def __init__(self, kvargs: dict): + self._pre_init(kvargs) + super().__init__(kvargs) + + def _pre_init(self, kvargs: dict): + self.main_model: TpPartBaseModel = kvargs.pop("main_model") + self.mtp_previous_draft_models: List[TpPartBaseModel] = kvargs.pop("mtp_previous_draft_models") + + def _init_custom(self): + self._cos_cached = self.main_model._cos_cached + self._sin_cached = self.main_model._sin_cached + + def _init_req_manager(self): + self.req_manager = self.main_model.req_manager + + def _init_mem_manager(self): + self.mem_manager = self.main_model.mem_manager + + def _init_weights(self, start_layer_index=None): + assert start_layer_index is None + + mtp_layer_start = self.config["num_hidden_layers"] + num_mtp_layers = self.config.get("num_nextn_predict_layers", 1) + + self.pre_post_weight = self.pre_and_post_weight_class(self.data_type, network_config=self.config) + + self.trans_layers_weight = [ + self.transformer_weight_class( + i, + self.data_type, + network_config=self.config, + quant_cfg=self.quant_cfg, + ) + for i in range(mtp_layer_start, mtp_layer_start + num_mtp_layers) + ] + + load_hf_weights( + self.data_type, + weight_dir=self.weight_dir_, + pre_post_layer=self.pre_post_weight, + transformer_layer_list=self.trans_layers_weight, + weight_dict=self.weight_dict, + ) + + self.pre_post_weight.verify_load() + [weight.verify_load() for weight in self.trans_layers_weight] + + self.pre_post_weight.wte_weight_ = self.main_model.pre_post_weight.wte_weight_ + self.pre_post_weight.lm_head_weight_ = 
self.main_model.pre_post_weight.lm_head_weight_ + + def _init_infer_layer(self, start_layer_index=None): + assert start_layer_index is None + + self.pre_infer = self.pre_layer_infer_class(network_config=self.config) + self.post_infer = self.post_layer_infer_class(network_config=self.config) + + total_pre_layers_num = len(self.main_model.layers_infer) + total_pre_layers_num += sum( + [len(previous_model.layers_infer) for previous_model in self.mtp_previous_draft_models] + ) + + num_mtp_layers = self.config.get("num_nextn_predict_layers", 1) + self.layers_infer = [ + self.transformer_layer_infer_class(i, network_config=self.config) + for i in range(total_pre_layers_num, total_pre_layers_num + num_mtp_layers) + ] + + def _init_some_value(self): + super()._init_some_value() + self.layers_num = self.config.get("num_nextn_predict_layers", 1) + + def autotune_layers(self): + return self.config.get("num_nextn_predict_layers", 1) diff --git a/lightllm/server/api_cli.py b/lightllm/server/api_cli.py index 44cc38822..296cea6b4 100644 --- a/lightllm/server/api_cli.py +++ b/lightllm/server/api_cli.py @@ -119,7 +119,7 @@ def make_argument_parser() -> argparse.ArgumentParser: parser.add_argument( "--batch_max_tokens", type=int, - default=None, + default=16384, help="max tokens num for new cat batch, it control prefill batch size to Preventing OOM", ) parser.add_argument( @@ -128,7 +128,7 @@ def make_argument_parser() -> argparse.ArgumentParser: parser.add_argument( "--tool_call_parser", type=str, - choices=["qwen25", "llama3", "mistral", "deepseekv3", "qwen", "deepseekv31"], + choices=["qwen25", "llama3", "mistral", "deepseekv3", "qwen", "deepseekv31", "glm47", "kimi_k2"], default=None, help="tool call parser type", ) @@ -259,7 +259,7 @@ def make_argument_parser() -> argparse.ArgumentParser: ) parser.add_argument("--disable_dynamic_prompt_cache", action="store_true", help="disable dynamic prompt cache") - parser.add_argument("--chunked_prefill_size", type=int, default=4096, help="chunked prefill size") + parser.add_argument("--chunked_prefill_size", type=int, default=8192, help="chunked prefill size") parser.add_argument("--disable_chunked_prefill", action="store_true", help="whether to disable chunked prefill") parser.add_argument("--diverse_mode", action="store_true", help="diversity generation mode") parser.add_argument("--token_healing_mode", action="store_true", help="code model infer mode") diff --git a/lightllm/server/function_call_parser.py b/lightllm/server/function_call_parser.py index 1620cff13..9214715b1 100644 --- a/lightllm/server/function_call_parser.py +++ b/lightllm/server/function_call_parser.py @@ -241,7 +241,7 @@ def parse_streaming_increment(self, new_text: str, tools: List[Tool]) -> Streami if start_idx >= len(current_text): return StreamingParseResult() - (obj, end_idx) = _partial_json_loads(current_text[start_idx:], flags) + obj, end_idx = _partial_json_loads(current_text[start_idx:], flags) is_current_complete = _is_complete_json(current_text[start_idx : start_idx + end_idx]) @@ -1173,6 +1173,276 @@ def parse_streaming_increment(self, new_text: str, tools: List[Tool]) -> Streami return StreamingParseResult(normal_text=current_text) +class Glm47Detector(BaseFormatDetector): + """ + Detector for GLM-4.7/GLM-4.7-Flash model function call format. + + The GLM-4.7 format uses an XML-style envelope with arg_key/arg_value pairs + instead of JSON arguments. 
+ + Format Structure: + ``` + <tool_call>function_name + <arg_key>param1</arg_key> + <arg_value>value1</arg_value> + <arg_key>param2</arg_key> + <arg_value>value2</arg_value> + </tool_call> + ``` + + Example: + ``` + <tool_call>tool_brave_web_search_post + <arg_key>query</arg_key> + <arg_value>test search</arg_value> + <arg_key>count</arg_key> + <arg_value>5</arg_value> + </tool_call> + ``` + + Key Components: + - Tool Call Tags: `<tool_call>` and `</tool_call>` wrap each individual call + - Function Name: Appears on the first line after `<tool_call>` + - Arguments: Pairs of `<arg_key>name</arg_key>` and `<arg_value>value</arg_value>` + + Reference: https://github.com/vllm-project/vllm/blob/main/vllm/tool_parsers/glm4_moe_tool_parser.py + """ + + def __init__(self): + super().__init__() + self.bot_token = "<tool_call>" + self.eot_token = "</tool_call>" + self.tool_call_separator = "\n" + + # Regex patterns for parsing GLM-4.7 tool calls + # Match complete tool call blocks + self.func_call_regex = re.compile(r"<tool_call>.*?</tool_call>", re.DOTALL) + # Extract function name and arguments from a tool call block + # Function name can be followed by newline OR directly by <arg_key> + # Pattern: <tool_call>function_name(\n|<arg_key>)...</tool_call> + self.func_detail_regex = re.compile( + r"<tool_call>([^<\n]+?)(?:\n|(?=<arg_key>)|(?=</tool_call>))(.*?)</tool_call>", re.DOTALL + ) + # Extract arg_key/arg_value pairs + self.func_arg_regex = re.compile(r"<arg_key>(.*?)</arg_key>\s*<arg_value>(.*?)</arg_value>", re.DOTALL) + + self._last_arguments = "" + self._normal_text_buffer = "" + + def has_tool_call(self, text: str) -> bool: + """Check if the text contains a GLM-4.7 format tool call.""" + return self.bot_token in text + + def _parse_xml_arguments(self, arg_text: str) -> dict: + """ + Parse XML-style arguments into a dictionary. + + Args: + arg_text: The text containing <arg_key>/<arg_value> pairs + + Returns: + Dictionary of argument name to value + """ + if not arg_text: + return {} + + args = {} + matches = self.func_arg_regex.findall(arg_text) + for key, value in matches: + key = key.strip() + value = value.strip() + # Try to parse value as JSON for complex types (arrays, objects, numbers, booleans) + try: + parsed_value = json.loads(value) + args[key] = parsed_value + except (json.JSONDecodeError, ValueError): + # Keep as string if not valid JSON + args[key] = value + return args + + def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult: + """ + One-time parsing: Detects and parses tool calls in the provided text. + + :param text: The complete text to parse. + :param tools: List of available tools. + :return: StreamingParseResult with normal_text and parsed calls. + """ + idx = text.find(self.bot_token) + normal_text = text[:idx].strip() if idx != -1 else text + + if self.bot_token not in text: + return StreamingParseResult(normal_text=normal_text, calls=[]) + + tool_indices = self._get_tool_indices(tools) + calls = [] + + # Find all <tool_call>...</tool_call>
blocks + match_result_list = self.func_call_regex.findall(text) + + for match_result in match_result_list: + try: + # Extract function name and arguments + func_detail = self.func_detail_regex.search(match_result) + if not func_detail: + logger.warning(f"Failed to parse GLM-4.7 tool call: {match_result}") + continue + + func_name = func_detail.group(1).strip() + arg_text = func_detail.group(2) if func_detail.group(2) else "" + + # Validate function name + if func_name not in tool_indices: + logger.warning(f"Model attempted to call undefined function: {func_name}") + continue + + # Parse XML arguments to JSON + func_args = self._parse_xml_arguments(arg_text) + + calls.append( + ToolCallItem( + tool_index=tool_indices[func_name], + name=func_name, + parameters=json.dumps(func_args, ensure_ascii=False), + ) + ) + except Exception as e: + logger.warning(f"Failed to parse GLM-4.7 tool call: {match_result}, error: {str(e)}") + continue + + return StreamingParseResult(normal_text=normal_text, calls=calls) + + def parse_streaming_increment(self, new_text: str, tools: List[Tool]) -> StreamingParseResult: + """ + Streaming incremental parsing for GLM-4.7 tool calls. + + This handles the streaming case where tool calls arrive incrementally. + """ + self._buffer += new_text + current_text = self._buffer + + # Check if we have a tool call starting + if not self.has_tool_call(current_text): + # Check for partial bot_token at the end + partial_len = self._ends_with_partial_token(current_text, self.bot_token) + if partial_len: + # Might be partial bot_token, keep buffering + return StreamingParseResult() + + # No tool call, emit as normal text + self._buffer = "" + # Clean up any stray end tokens + if self.eot_token in new_text: + new_text = new_text.replace(self.eot_token, "") + return StreamingParseResult(normal_text=new_text) + + # Build tool indices if not already built + if not hasattr(self, "_tool_indices"): + self._tool_indices = self._get_tool_indices(tools) + + calls: List[ToolCallItem] = [] + + try: + # Check if we have a complete tool call + if self.eot_token in current_text: + # We have at least one complete tool call + # Parse all complete tool calls + result = self.detect_and_parse(current_text, tools) + + # Find the end of the last complete tool call + last_end = current_text.rfind(self.eot_token) + if last_end != -1: + remaining = current_text[last_end + len(self.eot_token) :] + self._buffer = remaining.lstrip() + else: + self._buffer = "" + + # Reset state for next tool call + self.current_tool_id = -1 + self.current_tool_name_sent = False + self._last_arguments = "" + + return result + + # We have a partial tool call - try to stream it + # Extract what we can from the partial tool call + tool_call_start = current_text.find(self.bot_token) + if tool_call_start == -1: + return StreamingParseResult() + + # Get content after <tool_call> + content_after_start = current_text[tool_call_start + len(self.bot_token) :] + + # Try to extract function name (first line after <tool_call>) + newline_pos = content_after_start.find("\n") + if newline_pos == -1: + # Still waiting for function name to complete + return StreamingParseResult() + + func_name = content_after_start[:newline_pos].strip() + + # Initialize state if this is the first tool call + if self.current_tool_id == -1: + self.current_tool_id = 0 + self.prev_tool_call_arr = [] + self.streamed_args_for_tool = [""] + + # Ensure we have enough entries + while len(self.prev_tool_call_arr) <= self.current_tool_id: + self.prev_tool_call_arr.append({}) + while
len(self.streamed_args_for_tool) <= self.current_tool_id: + self.streamed_args_for_tool.append("") + + # Check if function name is valid + if func_name and func_name in self._tool_indices: + if not self.current_tool_name_sent: + # Send function name first + calls.append( + ToolCallItem( + tool_index=self.current_tool_id, + name=func_name, + parameters="", + ) + ) + self.current_tool_name_sent = True + self.prev_tool_call_arr[self.current_tool_id] = { + "name": func_name, + "arguments": {}, + } + else: + # Stream arguments incrementally + arg_text = content_after_start[newline_pos + 1 :] + current_args = self._parse_xml_arguments(arg_text) + + if current_args: + current_args_json = json.dumps(current_args, ensure_ascii=False) + prev_args = self.prev_tool_call_arr[self.current_tool_id].get("arguments", {}) + prev_args_json = json.dumps(prev_args, ensure_ascii=False) if prev_args else "" + + if current_args_json != prev_args_json: + # Calculate the diff + sent = len(self.streamed_args_for_tool[self.current_tool_id]) + argument_diff = current_args_json[sent:] + + if argument_diff: + calls.append( + ToolCallItem( + tool_index=self.current_tool_id, + name=None, + parameters=argument_diff, + ) + ) + self.streamed_args_for_tool[self.current_tool_id] += argument_diff + + self.prev_tool_call_arr[self.current_tool_id]["arguments"] = current_args + + return StreamingParseResult(normal_text="", calls=calls) + + except Exception as e: + logger.error(f"Error in GLM-4.7 parse_streaming_increment: {e}") + return StreamingParseResult(normal_text="", calls=calls) + + class FunctionCallParser: """ Parser for function/tool calls in model outputs. @@ -1185,6 +1455,7 @@ class FunctionCallParser: ToolCallParserEnum: Dict[str, Type[BaseFormatDetector]] = { "deepseekv3": DeepSeekV3Detector, "deepseekv31": DeepSeekV31Detector, + "glm47": Glm47Detector, "kimi_k2": KimiK2Detector, "llama3": Llama32Detector, "mistral": MistralDetector, diff --git a/lightllm/server/router/model_infer/mode_backend/base_backend.py b/lightllm/server/router/model_infer/mode_backend/base_backend.py index 805c9b8e5..64310d6b0 100644 --- a/lightllm/server/router/model_infer/mode_backend/base_backend.py +++ b/lightllm/server/router/model_infer/mode_backend/base_backend.py @@ -37,6 +37,7 @@ from lightllm.models.deepseek_mtp.model import Deepseek3MTPModel from lightllm.models.qwen3_moe_mtp.model import Qwen3MOEMTPModel from lightllm.models.mistral_mtp.model import MistralMTPModel +from lightllm.models.glm4_moe_lite_mtp.model import Glm4MoeLiteMTPModel from lightllm.server.router.model_infer.mode_backend.generic_post_process import sample from lightllm.common.basemodel.triton_kernel.gather_token_id import scatter_token from lightllm.server.pd_io_struct import NIXLChunckedTransTaskRet @@ -328,6 +329,9 @@ def init_mtp_draft_model(self, main_kvargs: dict): elif mtp_model_cfg["model_type"] == "mistral": assert self.args.mtp_mode in ["vanilla_no_att", "eagle_no_att"] self.draft_models.append(MistralMTPModel(mtp_model_kvargs)) + elif mtp_model_cfg["model_type"] == "glm4_moe_lite": + assert self.args.mtp_mode in ["vanilla_with_att", "eagle_with_att"] + self.draft_models.append(Glm4MoeLiteMTPModel(mtp_model_kvargs)) else: assert False, f"error mtp mode {mtp_model_cfg['model_type']}" diff --git a/test/eval/eval_bfcl.py b/test/eval/eval_bfcl.py new file mode 100755 index 000000000..59b81c26d --- /dev/null +++ b/test/eval/eval_bfcl.py @@ -0,0 +1,686 @@ +#!/usr/bin/env python3 +""" +BFCL (Berkeley Function Calling Leaderboard) Evaluation Script for LightLLM + 
+This script evaluates function/tool calling capabilities on the BFCL benchmark. + +Usage: + # Start LightLLM server first: + python -m lightllm.server.api_server --model_dir /path/to/GLM-4.7-Flash --tp 1 + + # Run evaluation: + python eval_bfcl.py \ + --model_name GLM-4.7-Flash \ + --base_url http://localhost:8000/v1 \ + --test_category simple + +Test Categories: + - simple: Single function calls (400 examples) + - multiple: Select one function from multiple options (200 examples) + - parallel: Multiple function calls in parallel (200 examples) + - parallel_multiple: Combination of parallel and multiple (200 examples) + - java: Java function calls (100 examples) + - javascript: JavaScript function calls (70 examples) + - irrelevance: Detect when no function should be called + - all: Run all categories + +Requirements: + pip install openai tqdm huggingface_hub +""" + +import argparse +import json +import os +import re +import ast +import time +from concurrent.futures import ThreadPoolExecutor, as_completed +from typing import List, Dict, Any, Optional, Tuple +from dataclasses import dataclass +from collections import defaultdict + +from tqdm import tqdm + +try: + from openai import OpenAI +except ImportError: + print("Please install openai: pip install openai") + exit(1) + +try: + from huggingface_hub import hf_hub_download +except ImportError: + print("Please install huggingface_hub: pip install huggingface_hub") + exit(1) + + +# BFCL Dataset on HuggingFace +BFCL_REPO = "gorilla-llm/Berkeley-Function-Calling-Leaderboard" + +# Test category mappings to filenames +TEST_CATEGORIES = { + "simple": "BFCL_v3_simple.json", + "multiple": "BFCL_v3_multiple.json", + "parallel": "BFCL_v3_parallel.json", + "parallel_multiple": "BFCL_v3_parallel_multiple.json", + "java": "BFCL_v3_java.json", + "javascript": "BFCL_v3_javascript.json", + "irrelevance": "BFCL_v3_irrelevance.json", + "live_simple": "BFCL_v3_live_simple.json", + "live_multiple": "BFCL_v3_live_multiple.json", + "live_parallel": "BFCL_v3_live_parallel.json", + "live_parallel_multiple": "BFCL_v3_live_parallel_multiple.json", + "rest": "BFCL_v3_rest.json", + "sql": "BFCL_v3_sql.json", +} + +# Possible answer files for ground truth +ANSWER_FILES = { + "simple": "possible_answer/BFCL_v3_simple.json", + "multiple": "possible_answer/BFCL_v3_multiple.json", + "parallel": "possible_answer/BFCL_v3_parallel.json", + "parallel_multiple": "possible_answer/BFCL_v3_parallel_multiple.json", + "java": "possible_answer/BFCL_v3_java.json", + "javascript": "possible_answer/BFCL_v3_javascript.json", + "live_simple": "possible_answer/BFCL_v3_live_simple.json", + "live_multiple": "possible_answer/BFCL_v3_live_multiple.json", + "live_parallel": "possible_answer/BFCL_v3_live_parallel.json", + "live_parallel_multiple": "possible_answer/BFCL_v3_live_parallel_multiple.json", + "sql": "possible_answer/BFCL_v3_sql.json", +} + + +@dataclass +class EvalResult: + """Result of a single evaluation.""" + + task_id: str + category: str + passed: bool + model_output: str + expected: Any + error: Optional[str] = None + + +def download_bfcl_file(filename: str) -> str: + """Download a BFCL file from HuggingFace Hub.""" + try: + local_path = hf_hub_download( + repo_id=BFCL_REPO, + filename=filename, + repo_type="dataset", + ) + return local_path + except Exception as e: + print(f"Error downloading {filename}: {e}") + return None + + +def load_jsonl_or_json(filepath: str) -> List[Dict[str, Any]]: + """Load data from JSON or JSONL file.""" + data = [] + with open(filepath, "r", 
encoding="utf-8") as f: + content = f.read().strip() + # Try as JSON array first + try: + data = json.loads(content) + if isinstance(data, dict): + data = [data] + except json.JSONDecodeError: + # Try as JSONL + f.seek(0) + for line in f: + line = line.strip() + if line: + try: + data.append(json.loads(line)) + except json.JSONDecodeError: + continue + return data + + +def load_bfcl_data(category: str, limit: Optional[int] = None) -> List[Dict[str, Any]]: + """Load BFCL dataset for a specific category.""" + filename = TEST_CATEGORIES.get(category) + if not filename: + print(f"Unknown category: {category}") + return [] + + print(f"Downloading {filename} from HuggingFace...") + filepath = download_bfcl_file(filename) + if not filepath: + return [] + + print(f"Loading data from {filepath}") + data = load_jsonl_or_json(filepath) + + # Also load ground truth answers if available + answer_file = ANSWER_FILES.get(category) + if answer_file: + print(f"Downloading answer file {answer_file}...") + answer_path = download_bfcl_file(answer_file) + if answer_path: + answers = load_jsonl_or_json(answer_path) + # Create a mapping from id to answer + answer_map = {} + for ans in answers: + ans_id = ans.get("id", "") + answer_map[ans_id] = ans.get("ground_truth", ans.get("result", [])) + + # Merge answers into data + for item in data: + item_id = item.get("id", "") + if item_id in answer_map: + item["ground_truth"] = answer_map[item_id] + + if limit: + data = data[:limit] + + print(f"Loaded {len(data)} examples for category: {category}") + return data + + +def fix_schema_types(schema: Any) -> Any: + """ + Fix Python type names to JSON Schema types. + BFCL uses Python type names like 'dict', 'list' but JSON Schema needs 'object', 'array'. + """ + if isinstance(schema, dict): + result = {} + for key, value in schema.items(): + if key == "type" and isinstance(value, str): + # Map Python types to JSON Schema types + type_mapping = { + "dict": "object", + "list": "array", + "str": "string", + "int": "integer", + "float": "number", + "bool": "boolean", + "NoneType": "null", + "tuple": "array", + } + result[key] = type_mapping.get(value, value) + else: + result[key] = fix_schema_types(value) + return result + elif isinstance(schema, list): + return [fix_schema_types(item) for item in schema] + else: + return schema + + +def convert_to_openai_tools(functions: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """Convert BFCL function format to OpenAI tools format.""" + tools = [] + for func in functions: + if isinstance(func, str): + func = json.loads(func) + + # Fix the parameters schema to use valid JSON Schema types + parameters = fix_schema_types(func.get("parameters", {})) + + tool = { + "type": "function", + "function": { + "name": func.get("name", ""), + "description": func.get("description", ""), + "parameters": parameters, + }, + } + tools.append(tool) + return tools + + +def parse_function_call(response: str) -> List[Dict[str, Any]]: + """Parse function calls from model response.""" + calls = [] + + # Try to parse as JSON array + try: + parsed = json.loads(response) + if isinstance(parsed, list): + return parsed + elif isinstance(parsed, dict): + return [parsed] + except json.JSONDecodeError: + pass + + # Try to find function call patterns + # Pattern 1: function_name(args) + func_pattern = r"(\w+)\s*\((.*?)\)" + matches = re.findall(func_pattern, response, re.DOTALL) + for name, args_str in matches: + try: + # Try to parse args as Python dict/kwargs + args_str = args_str.strip() + if args_str: + # Convert 
+
+
+def parse_function_call(response: str) -> List[Dict[str, Any]]:
+    """Parse function calls from model response."""
+    calls = []
+
+    # Try to parse as JSON array
+    try:
+        parsed = json.loads(response)
+        if isinstance(parsed, list):
+            return parsed
+        elif isinstance(parsed, dict):
+            return [parsed]
+    except json.JSONDecodeError:
+        pass
+
+    # Try to find function call patterns
+    # Pattern 1: function_name(args)
+    func_pattern = r"(\w+)\s*\((.*?)\)"
+    matches = re.findall(func_pattern, response, re.DOTALL)
+    for name, args_str in matches:
+        try:
+            # Try to parse args as Python dict/kwargs
+            args_str = args_str.strip()
+            if args_str:
+                # Convert kwargs-style arguments to a dict. This evaluates
+                # model-generated text, so hide the builtins from eval.
+                args = eval(f"dict({args_str})", {"__builtins__": {}, "dict": dict}, {})
+            else:
+                args = {}
+            calls.append({"name": name, "arguments": args})
+        except Exception:
+            pass
+
+    # Pattern 2: JSON-like tool_calls
+    tool_call_pattern = r'\{"name":\s*"([^"]+)",\s*"arguments":\s*(\{[^}]+\})\}'
+    matches = re.findall(tool_call_pattern, response)
+    for name, args_str in matches:
+        try:
+            args = json.loads(args_str)
+            calls.append({"name": name, "arguments": args})
+        except json.JSONDecodeError:
+            pass
+
+    return calls
+
+
+def extract_tool_calls_from_response(response) -> List[Dict[str, Any]]:
+    """Extract tool calls from OpenAI API response."""
+    calls = []
+
+    if hasattr(response, "choices") and response.choices:
+        choice = response.choices[0]
+        message = choice.message
+
+        # Check for tool_calls in response
+        if hasattr(message, "tool_calls") and message.tool_calls:
+            for tool_call in message.tool_calls:
+                func = tool_call.function
+                try:
+                    args = json.loads(func.arguments) if func.arguments else {}
+                except json.JSONDecodeError:
+                    args = {}
+                calls.append({"name": func.name, "arguments": args})
+
+        # Also check content for function calls (some models output in content)
+        if hasattr(message, "content") and message.content:
+            content_calls = parse_function_call(message.content)
+            if content_calls and not calls:
+                calls = content_calls
+
+    return calls
+
+
+def normalize_value(value: Any) -> Any:
+    """Normalize values for comparison."""
+    if isinstance(value, str):
+        # Try to parse as number
+        try:
+            return float(value)
+        except ValueError:
+            return value.lower().strip()
+    elif isinstance(value, bool):
+        return value
+    elif isinstance(value, (int, float)):
+        return float(value)
+    elif isinstance(value, list):
+        return [normalize_value(v) for v in value]
+    elif isinstance(value, dict):
+        return {k: normalize_value(v) for k, v in value.items()}
+    return value
+
+
+def value_matches_expected(predicted_value: Any, expected_values: Any) -> bool:
+    """
+    Check if predicted value matches expected value(s).
+    BFCL format: expected values can be a list of acceptable values.
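+
+    Illustrative examples (they follow the normalization rules above; not taken
+    from real BFCL data):
+        value_matches_expected("Paris", ["Paris", "paris"]) -> True   (case-insensitive)
+        value_matches_expected("5", [5, "five"]) -> True   (numeric strings compare as floats)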
+    """
+    # Normalize predicted value
+    pred_normalized = normalize_value(predicted_value)
+
+    # If expected is a list, check if predicted matches any item
+    if isinstance(expected_values, list):
+        for exp_val in expected_values:
+            exp_normalized = normalize_value(exp_val)
+            if pred_normalized == exp_normalized:
+                return True
+            # Also try string comparison for edge cases
+            if str(pred_normalized) == str(exp_normalized):
+                return True
+        return False
+    else:
+        exp_normalized = normalize_value(expected_values)
+        return pred_normalized == exp_normalized or str(pred_normalized) == str(exp_normalized)
+
+
+def compare_function_calls(
+    predicted: List[Dict[str, Any]], expected: List[Dict[str, Any]], strict: bool = False
+) -> Tuple[bool, str]:
+    """Compare predicted function calls with expected ones."""
+    if not predicted and not expected:
+        return True, ""
+
+    if len(predicted) != len(expected):
+        return False, f"Count mismatch: predicted {len(predicted)}, expected {len(expected)}"
+
+    # Sort by function name for comparison
+    pred_sorted = sorted(predicted, key=lambda x: x.get("name", ""))
+    exp_sorted = sorted(expected, key=lambda x: x.get("name", ""))
+
+    for pred, exp in zip(pred_sorted, exp_sorted):
+        pred_name = pred.get("name", "")
+        exp_name = exp.get("name", "")
+
+        if pred_name != exp_name:
+            return False, f"Function name mismatch: {pred_name} vs {exp_name}"
+
+        pred_args = pred.get("arguments", {})
+        exp_args = exp.get("arguments", {})
+
+        # Check required arguments match (BFCL format: values are lists of acceptable values)
+        for key, expected_values in exp_args.items():
+            if key not in pred_args:
+                return False, f"Missing argument {key} in {pred_name}"
+            if not value_matches_expected(pred_args[key], expected_values):
+                return False, f"Argument {key} mismatch in {pred_name}"
+
+    return True, ""
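+
+
+# Illustrative comparison (not executed; the function and argument names are hypothetical):
+#     compare_function_calls(
+#         [{"name": "get_weather", "arguments": {"city": "Paris"}}],
+#         [{"name": "get_weather", "arguments": {"city": ["Paris", "paris"]}}],
+#     )
+# returns (True, ""), while a missing or mismatched argument returns (False, "<reason>").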
+
+
+def parse_expected_output(ground_truth: Any) -> List[Dict[str, Any]]:
+    """
+    Parse expected output from BFCL ground truth.
+
+    BFCL format: [{"func_name": {"arg1": [val1, val2], "arg2": [val3]}}]
+    Convert to:  [{"name": "func_name", "arguments": {"arg1": [val1, val2], "arg2": [val3]}}]
+    """
+    if isinstance(ground_truth, str):
+        try:
+            ground_truth = json.loads(ground_truth)
+        except json.JSONDecodeError:
+            # Try parsing as Python literal
+            try:
+                ground_truth = ast.literal_eval(ground_truth)
+            except (ValueError, SyntaxError):
+                return []
+
+    if not ground_truth:
+        return []
+
+    # Ensure it's a list
+    if isinstance(ground_truth, dict):
+        ground_truth = [ground_truth]
+
+    result = []
+    for item in ground_truth:
+        if isinstance(item, dict):
+            # Check if it's already in standard format {"name": ..., "arguments": ...}
+            if "name" in item and "arguments" in item:
+                result.append(item)
+            else:
+                # BFCL format: {"func_name": {"arg1": [v1], "arg2": [v2]}}
+                for func_name, args in item.items():
+                    if isinstance(args, dict):
+                        result.append({"name": func_name, "arguments": args})
+                    else:
+                        # Handle edge case where args might not be a dict
+                        result.append({"name": func_name, "arguments": {}})
+
+    return result
+
+
+class BFCLEvaluator:
+    """BFCL Benchmark Evaluator using OpenAI-compatible API."""
+
+    def __init__(
+        self,
+        base_url: str,
+        model_name: str,
+        api_key: str = "EMPTY",
+        max_tokens: int = 1024,
+        temperature: float = 0.0,
+    ):
+        self.client = OpenAI(base_url=base_url, api_key=api_key)
+        self.model_name = model_name
+        self.max_tokens = max_tokens
+        self.temperature = temperature
+
+    def generate_response(
+        self, prompt: str, tools: List[Dict[str, Any]], system_prompt: Optional[str] = None
+    ) -> Tuple[Any, List[Dict[str, Any]]]:
+        """Generate response from the model with tool calling."""
+        messages = []
+
+        if system_prompt:
+            messages.append({"role": "system", "content": system_prompt})
+
+        messages.append({"role": "user", "content": prompt})
+
+        try:
+            response = self.client.chat.completions.create(
+                model=self.model_name,
+                messages=messages,
+                tools=tools if tools else None,
+                tool_choice="auto" if tools else None,
+                max_tokens=self.max_tokens,
+                temperature=self.temperature,
+            )
+            tool_calls = extract_tool_calls_from_response(response)
+            return response, tool_calls
+        except Exception as e:
+            print(f"API Error: {e}")
+            return None, []
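+
+    # Illustrative shape of generate_response's return value (not from a real run):
+    #     (response, [{"name": "get_weather", "arguments": {"city": "Paris"}}])
+    # i.e. the raw ChatCompletion object plus the tool calls parsed from it.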
+
+    def evaluate_single(self, item: Dict[str, Any], category: str) -> EvalResult:
+        """Evaluate a single BFCL example."""
+        task_id = item.get("id", "unknown")
+
+        # Extract question and functions
+        question = item.get("question", [[{"role": "user", "content": ""}]])
+        if isinstance(question, str):
+            prompt = question
+        elif isinstance(question, list) and question:
+            if isinstance(question[0], dict):
+                prompt = question[0].get("content", "")
+            elif isinstance(question[0], list) and question[0]:
+                prompt = question[0][0].get("content", "")
+            else:
+                prompt = str(question[0])
+        else:
+            prompt = str(question)
+
+        # Get functions
+        functions = item.get("function", [])
+        if isinstance(functions, str):
+            try:
+                functions = json.loads(functions)
+            except json.JSONDecodeError:
+                functions = []
+
+        if not isinstance(functions, list):
+            functions = [functions]
+
+        # Convert to OpenAI tools format
+        tools = convert_to_openai_tools(functions)
+
+        # Get expected output
+        ground_truth = item.get("ground_truth", item.get("answer", []))
+        expected = parse_expected_output(ground_truth)
+
+        # Generate response
+        system_prompt = (
+            "You are a helpful assistant that can use tools/functions to help answer questions. "
+            "When you need to call a function, use the provided tools."
+        )
+
+        response, predicted_calls = self.generate_response(prompt, tools, system_prompt)
+
+        if response is None:
+            return EvalResult(
+                task_id=task_id,
+                category=category,
+                passed=False,
+                model_output="",
+                expected=expected,
+                error="API call failed",
+            )
+
+        # For the irrelevance category, the model should NOT call any function
+        if "irrelevance" in category.lower():
+            passed = len(predicted_calls) == 0
+            error = "Model called a function when it shouldn't" if not passed else None
+        else:
+            # Compare function calls
+            passed, error = compare_function_calls(predicted_calls, expected)
+
+        model_output = json.dumps(predicted_calls, indent=2) if predicted_calls else str(response)
+
+        return EvalResult(
+            task_id=task_id, category=category, passed=passed, model_output=model_output, expected=expected, error=error
+        )
+
+    def evaluate_category(self, category: str, limit: Optional[int] = None, num_workers: int = 4) -> Dict[str, Any]:
+        """Evaluate all examples in a category."""
+        print(f"\nLoading BFCL dataset for category: {category}")
+        data = load_bfcl_data(category, limit)
+
+        if not data:
+            print(f"No data found for category: {category}")
+            return {"category": category, "total": 0, "passed": 0, "accuracy": 0.0}
+
+        print(f"Loaded {len(data)} examples")
+
+        results = []
+
+        # Use ThreadPoolExecutor for concurrent evaluation
+        with ThreadPoolExecutor(max_workers=num_workers) as executor:
+            futures = {executor.submit(self.evaluate_single, item, category): item for item in data}
+
+            for future in tqdm(as_completed(futures), total=len(futures), desc=f"Evaluating {category}"):
+                try:
+                    result = future.result()
+                    results.append(result)
+                except Exception as e:
+                    print(f"Error evaluating: {e}")
+
+        # Calculate metrics
+        total = len(results)
+        passed = sum(1 for r in results if r.passed)
+        accuracy = passed / total * 100 if total > 0 else 0.0
+
+        # Collect errors for analysis
+        errors = defaultdict(int)
+        for r in results:
+            if not r.passed and r.error:
+                errors[r.error[:50]] += 1
+
+        return {
+            "category": category,
+            "total": total,
+            "passed": passed,
+            "accuracy": accuracy,
+            "results": results,
+            "error_summary": dict(errors),
+        }
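+
+
+# Illustrative programmatic use (assumes an OpenAI-compatible LightLLM server is already
+# running locally; the URL, model name, and limit below are placeholders):
+#     evaluator = BFCLEvaluator(base_url="http://localhost:8000/v1", model_name="GLM-4.7-Flash")
+#     summary = evaluator.evaluate_category("simple", limit=10)
+#     print(summary["accuracy"])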
+
+
+def main():
+    parser = argparse.ArgumentParser(description="BFCL Evaluation for LightLLM")
+    parser.add_argument("--model_name", type=str, required=True, help="Model name")
+    parser.add_argument(
+        "--base_url", type=str, default="http://localhost:8000/v1", help="OpenAI-compatible API base URL"
+    )
+    parser.add_argument("--api_key", type=str, default="EMPTY", help="API key (use EMPTY for local)")
+    parser.add_argument(
+        "--test_category",
+        type=str,
+        default="simple",
+        choices=list(TEST_CATEGORIES.keys()) + ["all"],
+        help="Test category to evaluate",
+    )
+    parser.add_argument("--limit", type=int, default=None, help="Limit number of examples (for testing)")
+    parser.add_argument("--num_workers", type=int, default=4, help="Number of concurrent workers")
+    parser.add_argument("--max_tokens", type=int, default=1024, help="Maximum tokens to generate")
+    parser.add_argument("--temperature", type=float, default=0.0, help="Sampling temperature")
+    parser.add_argument("--output", "-o", type=str, default=None, help="Output file for detailed results")
+
+    args = parser.parse_args()
+
+    print("=" * 60)
+    print("BFCL (Berkeley Function Calling Leaderboard) Evaluation")
+    print("=" * 60)
+    print(f"Model: {args.model_name}")
+    print(f"API URL: {args.base_url}")
+    print(f"Test Category: {args.test_category}")
+    print()
+
+    evaluator = BFCLEvaluator(
+        base_url=args.base_url,
+        model_name=args.model_name,
+        api_key=args.api_key,
+        max_tokens=args.max_tokens,
+        temperature=args.temperature,
+    )
+
+    # Determine categories to evaluate
+    if args.test_category == "all":
+        categories = list(TEST_CATEGORIES.keys())
+    else:
+        categories = [args.test_category]
+
+    all_results = {}
+
+    for category in categories:
+        result = evaluator.evaluate_category(category, limit=args.limit, num_workers=args.num_workers)
+        all_results[category] = result
+
+        print(f"\n{category.upper()} Results:")
+        print(f"  Total: {result['total']}")
+        print(f"  Passed: {result['passed']}")
+        print(f"  Accuracy: {result['accuracy']:.2f}%")
+
+        if result.get("error_summary"):
+            print("  Common errors:")
+            for error, count in sorted(result["error_summary"].items(), key=lambda x: -x[1])[:5]:
+                print(f"    - {error}: {count}")
+
+    # Print summary
+    print("\n" + "=" * 60)
+    print("SUMMARY")
+    print("=" * 60)
+    print(f"{'Category':<25} {'Total':>8} {'Passed':>8} {'Accuracy':>10}")
+    print("-" * 60)
+
+    total_all = 0
+    passed_all = 0
+
+    for category, result in all_results.items():
+        print(f"{category:<25} {result['total']:>8} {result['passed']:>8} {result['accuracy']:>9.2f}%")
+        total_all += result["total"]
+        passed_all += result["passed"]
+
+    if len(all_results) > 1:
+        print("-" * 60)
+        overall_acc = passed_all / total_all * 100 if total_all > 0 else 0
+        print(f"{'OVERALL':<25} {total_all:>8} {passed_all:>8} {overall_acc:>9.2f}%")
+
+    print("=" * 60)
+
+    # Save detailed results
+    if args.output:
+        output_data = {
+            "model": args.model_name,
+            "config": {
+                "base_url": args.base_url,
+                "max_tokens": args.max_tokens,
+                "temperature": args.temperature,
+            },
+            "results": {
+                cat: {
+                    "total": r["total"],
+                    "passed": r["passed"],
+                    "accuracy": r["accuracy"],
+                    "error_summary": r.get("error_summary", {}),
+                }
+                for cat, r in all_results.items()
+            },
+        }
+        with open(args.output, "w") as f:
+            json.dump(output_data, f, indent=2)
+        print(f"\nResults saved to {args.output}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/test/eval/requirements.txt b/test/eval/requirements.txt
new file mode 100644
index 000000000..e57d2da41
--- /dev/null
+++ b/test/eval/requirements.txt
@@ -0,0 +1,13 @@
+# Evaluation benchmark dependencies
+aiohttp>=3.8.0
+tqdm>=4.64.0
+transformers>=4.30.0
+numpy>=1.21.0
+openai>=1.0.0
+huggingface_hub>=0.20.0
+
+# Optional: official human-eval package for dataset loading
+# pip install git+https://github.com/openai/human-eval.git
+
+# Optional: official BFCL evaluation package
+# pip install bfcl-eval
diff --git a/test/eval/run_bfcl.sh b/test/eval/run_bfcl.sh
new file mode 100755
index 000000000..2e68f8380
--- /dev/null
+++ b/test/eval/run_bfcl.sh
@@ -0,0 +1,48 @@
+#!/bin/bash
+# BFCL (Berkeley Function Calling Leaderboard) evaluation script for LightLLM
+#
+# Prerequisites:
+# 1. Start LightLLM server:
+#    python -m lightllm.server.api_server \
+#        --model_dir /path/to/GLM-4.7-Flash \
+#        --tp 1 \
+#        --port 8000
+#
+# 2. Install dependencies:
+#    pip install -r test/eval/requirements.txt
+
+set -e
+
+# Configuration
+MODEL_NAME="${MODEL_NAME:-GLM-4.7-Flash}"
+PORT="${PORT:-8000}"
+BASE_URL="${BASE_URL:-http://localhost:${PORT}/v1}"
+TEST_CATEGORY="${TEST_CATEGORY:-simple}"
+NUM_WORKERS="${NUM_WORKERS:-4}"
+
+# Check if server is running
curl -s "http://localhost:${PORT}/health" > /dev/null 2>&1; then + echo "Error: LightLLM server not running on port ${PORT}" + echo "Start the server first with:" + echo " python -m lightllm.server.api_server --model_dir /path/to/model --tp 1 --port ${PORT}" + exit 1 +fi + +echo "==========================================" +echo "BFCL Function Calling Evaluation" +echo "==========================================" +echo "Model: ${MODEL_NAME}" +echo "Server: ${BASE_URL}" +echo "Test Category: ${TEST_CATEGORY}" +echo "" + +# Run evaluation +python "$(dirname "$0")/eval_bfcl.py" \ + --model_name "${MODEL_NAME}" \ + --base_url "${BASE_URL}" \ + --test_category "${TEST_CATEGORY}" \ + --num_workers "${NUM_WORKERS}" \ + --output "bfcl_results_${TEST_CATEGORY}_$(date +%Y%m%d_%H%M%S).json" + +echo "" +echo "Evaluation complete!"