diff --git a/last_bench/BENCHMARK_REPORT.md b/last_bench/BENCHMARK_REPORT.md new file mode 100644 index 000000000..834490c5f --- /dev/null +++ b/last_bench/BENCHMARK_REPORT.md @@ -0,0 +1,123 @@ +# GLM-4.7-Flash Attention Backend Performance Report + +Generated from 63 benchmark measurements. + +**Backends compared**: triton, fa3, flashmla, flashinfer + +**Scenarios**: +- 1000→1000: Short input, short output +- 1000→8000: Short input, long output +- 8000→1000: Long input, short output + +**Concurrency levels**: 10, 80, 320 + +--- + +## Executive Summary + +Best performing backend per scenario (highest output throughput): + +| Scenario | Best Backend | Max Throughput (tok/s) | @ Concurrency | +|----------|--------------|------------------------|---------------| +| 1000→1000 | **lightllm-flashinfer** | 2,768.4 | 320 | +| 1000→8000 | **lightllm-flashinfer** | 2,684.0 | 320 | +| 8000→1000 | **lightllm-fa3** | 1,985.7 | 320 | + +--- + +## Detailed Results by Scenario + +### Scenario: 1000→1000 (Short Input → Short Output) + +| Concurrency | Backend | QPS | Throughput (tok/s) | TTFT (ms) | P99 TTFT (ms) | ITL (ms) | P99 ITL (ms) | +|-------------|------------|-------|-------------------|-----------|---------------|----------|--------------| +| 10 | triton | 0.24 | 101.5 | 269.6 | 1261.0 | 9.20 | 10.18 | +| | fa3 | 0.26 | 110.9 | 105.9 | 176.0 | 8.75 | 8.96 | +| | flashmla | 0.26 | 109.9 | 112.2 | 221.7 | 8.82 | 9.25 | +| | flashinfer | 0.28 | 117.3 | 113.2 | 164.7 | 8.25 | 8.50 | +| | lightllm-fa3 | 0.35 | 149.5 | 60.6 | 72.1 | 6.56 | 8.40 | +| | lightllm-flashinfer | 0.35 | 149.6 | 58.6 | 72.7 | 6.56 | 8.41 | +| | lightllm-triton | 0.35 | 146.4 | 54.3 | 70.3 | 6.72 | 8.42 | +| 80 | triton | 1.36 | 694.5 | 392.1 | 1561.8 | 19.45 | 90.76 | +| | fa3 | 1.50 | 764.0 | 153.1 | 337.5 | 18.00 | 88.61 | +| | flashmla | 1.45 | 739.7 | 184.9 | 429.6 | 18.50 | 94.12 | +| | flashinfer | 1.67 | 853.5 | 166.6 | 383.4 | 16.12 | 91.41 | +| | lightllm-fa3 | 2.04 | 1,041.0 | 65.8 | 103.2 | 13.13 | 15.34 | +| | lightllm-flashinfer | 2.05 | 1,046.3 | 63.6 | 125.0 | 13.07 | 15.35 | +| | lightllm-triton | 1.73 | 880.7 | 67.6 | 97.6 | 15.79 | 18.57 | +| 320 | triton | 3.63 | 1,928.4 | 144.5 | 260.2 | 29.49 | 98.22 | +| | fa3 | 3.93 | 2,090.8 | 189.7 | 498.5 | 27.11 | 94.84 | +| | flashmla | 3.76 | 2,000.3 | 269.5 | 868.0 | 28.32 | 99.13 | +| | flashinfer | 4.21 | 2,236.0 | 162.3 | 314.6 | 25.97 | 96.63 | +| | lightllm-fa3 | 5.02 | 2,668.5 | 158.7 | 430.5 | 21.16 | 80.35 | +| | lightllm-flashinfer | 5.21 | 2,768.4 | 137.9 | 531.4 | 20.33 | 94.61 | +| | lightllm-triton | 4.39 | 2,332.7 | 123.9 | 276.3 | 24.11 | 80.39 | + +### Scenario: 1000→8000 (Short Input → Long Output) + +| Concurrency | Backend | QPS | Throughput (tok/s) | TTFT (ms) | P99 TTFT (ms) | ITL (ms) | P99 ITL (ms) | +|-------------|------------|-------|-------------------|-----------|---------------|----------|--------------| +| 10 | triton | 0.02 | 84.1 | 143.2 | 384.8 | 11.86 | 16.50 | +| | fa3 | 0.03 | 112.9 | 112.0 | 138.9 | 8.84 | 9.10 | +| | flashmla | 0.02 | 100.4 | 164.7 | 664.4 | 9.93 | 11.77 | +| | flashinfer | 0.03 | 120.0 | 123.0 | 223.8 | 8.34 | 8.74 | +| | lightllm-fa3 | 0.03 | 149.8 | 206.3 | 593.4 | 6.64 | 8.45 | +| | lightllm-flashinfer | 0.03 | 147.9 | 154.0 | 230.5 | 6.73 | 8.45 | +| | lightllm-triton | 0.02 | 110.4 | 200.0 | 571.9 | 9.02 | 14.45 | +| 80 | triton | 0.16 | 623.7 | 250.5 | 880.0 | 22.17 | 27.21 | +| | fa3 | 0.21 | 840.0 | 214.5 | 720.5 | 16.77 | 17.85 | +| | flashmla | 0.21 | 820.2 | 294.6 | 1089.6 | 17.01 | 18.01 | +| | 
flashinfer | 0.24 | 947.0 | 214.8 | 738.7 | 14.99 | 15.76 | +| | lightllm-fa3 | 0.26 | 1,054.0 | 468.2 | 2356.6 | 13.35 | 15.25 | +| | lightllm-flashinfer | 0.26 | 1,050.6 | 206.3 | 701.2 | 13.42 | 15.21 | +| | lightllm-triton | 0.17 | 677.4 | 167.8 | 393.4 | 20.18 | 23.16 | +| 320 | triton | 0.40 | 1,634.9 | 331.7 | 1511.1 | 34.20 | 75.71 | +| | fa3 | 0.57 | 2,310.4 | 284.6 | 1195.5 | 24.40 | 73.23 | +| | flashmla | 0.58 | 2,367.8 | 347.4 | 1505.3 | 23.62 | 96.43 | +| | flashinfer | 0.64 | 2,615.2 | 289.3 | 1173.0 | 21.98 | 91.45 | +| | lightllm-fa3 | 0.65 | 2,660.1 | 170.5 | 400.1 | 21.42 | 76.17 | +| | lightllm-flashinfer | 0.66 | 2,684.0 | 213.3 | 557.1 | 21.18 | 94.81 | +| | lightllm-triton | 0.42 | 1,710.8 | 221.4 | 679.5 | 32.72 | 78.24 | + +### Scenario: 8000→1000 (Long Input → Short Output) + +| Concurrency | Backend | QPS | Throughput (tok/s) | TTFT (ms) | P99 TTFT (ms) | ITL (ms) | P99 ITL (ms) | +|-------------|------------|-------|-------------------|-----------|---------------|----------|--------------| +| 10 | triton | 0.17 | 73.7 | 217.7 | 426.0 | 13.08 | 16.85 | +| | fa3 | 0.25 | 107.0 | 208.0 | 464.6 | 8.87 | 9.13 | +| | flashmla | 0.22 | 93.8 | 188.3 | 448.2 | 10.24 | 11.91 | +| | flashinfer | 0.27 | 112.9 | 169.5 | 270.5 | 8.47 | 8.89 | +| | lightllm-fa3 | 0.34 | 142.7 | 151.8 | 276.2 | 6.67 | 8.47 | +| | lightllm-flashinfer | 0.33 | 140.7 | 139.2 | 242.7 | 6.79 | 8.49 | +| | lightllm-triton | 0.22 | 94.7 | 167.5 | 366.2 | 10.19 | 14.63 | +| 80 | triton | 0.87 | 454.3 | 1275.7 | 8538.7 | 29.22 | 199.05 | +| | fa3 | 1.34 | 697.6 | 451.7 | 2349.4 | 19.57 | 110.56 | +| | flashmla | 1.31 | 682.1 | 538.0 | 2850.3 | 19.76 | 103.51 | +| | flashinfer | 1.43 | 745.5 | 453.6 | 2351.6 | 18.35 | 109.11 | +| | lightllm-fa3 | 1.76 | 919.2 | 181.4 | 734.0 | 15.08 | 88.50 | +| | lightllm-flashinfer | 1.76 | 914.9 | 184.9 | 661.3 | 15.13 | 93.04 | +| | lightllm-triton | 1.12 | 581.7 | 225.4 | 973.7 | 23.53 | 105.45 | +| 320 | triton | 1.50 | 795.5 | 3873.7 | 30295.7 | 68.73 | 464.47 | +| | fa3 | 2.74 | 1,453.1 | 1138.1 | 8315.6 | 38.80 | 196.49 | +| | flashmla | 2.84 | 1,506.9 | 1201.1 | 8968.1 | 37.03 | 181.73 | +| | flashinfer | 2.85 | 1,515.0 | 1119.6 | 8247.4 | 37.54 | 195.69 | +| | lightllm-fa3 | 3.74 | 1,985.7 | 277.2 | 673.9 | 29.14 | 147.19 | +| | lightllm-flashinfer | 3.70 | 1,964.2 | 273.3 | 717.0 | 29.49 | 134.72 | +| | lightllm-triton | 2.36 | 1,254.9 | 313.2 | 717.6 | 46.08 | 199.79 | + +--- + +## Key Findings + +1. **Highest Throughput**: lightllm-flashinfer achieves 2,768.4 tok/s on 1000→1000 @ concurrency 320 +2. **Lowest TTFT**: lightllm-triton achieves 54.3ms on 1000→1000 @ concurrency 10 +3. 
**Lowest ITL**: lightllm-fa3 achieves 6.56ms on 1000→1000 @ concurrency 10 + +### Concurrency Scaling (1000→8000 scenario) + +| Backend | 10 conc | 80 conc | 320 conc | Scale Factor | +|---------|---------|---------|----------|--------------| +| triton | 84.1 | 623.7 | 1,634.9 | 19.4x | +| fa3 | 112.9 | 840.0 | 2,310.4 | 20.5x | +| flashmla | 100.4 | 820.2 | 2,367.8 | 23.6x | +| flashinfer | 120.0 | 947.0 | 2,615.2 | 21.8x | diff --git a/last_bench/bench.sh b/last_bench/bench.sh new file mode 100644 index 000000000..54aba7883 --- /dev/null +++ b/last_bench/bench.sh @@ -0,0 +1,56 @@ +#!/usr/bin/env bash +set -euo pipefail +IFS=$'\n\t' + +log() { printf '%s - %s\n' "$(date -u +"%Y-%m-%dT%H:%M:%SZ")" "$*" >&2; } + +input_len=(1000 8000) +output_len=(8000 1000) +num_prompts=(10 80 320) +max_concurrencys=(1 16 64) +tags=("triton" "fa3" "flashmla" "flashinfer") +ports=(30000 30001 30002 30003) + +# Model path can be overridden by env var MODEL_PATH +MODEL_PATH=${MODEL_PATH:-/dev/shm/GLM-4.7-Flash} +if [ ! -e "$MODEL_PATH" ]; then + log "Warning: model path '$MODEL_PATH' does not exist. Proceeding anyway." +else + log "Using model path: $MODEL_PATH" +fi + +if ! command -v python >/dev/null 2>&1; then + log "Error: python not found in PATH" + exit 1 +fi + +count1=${#output_len[@]} +for ((i=0; i&2; } + +input_len=(1000) +output_len=(1000) +num_prompts=(10 80 320) +max_concurrencys=(1 16 64) +tags=("lightllm-fa3" "lightllm-flashinfer" "lightllm-triton") +ports=(24000 24001 24002) + +# Model path can be overridden by env var MODEL_PATH +MODEL_PATH=${MODEL_PATH:-/dev/shm/GLM-4.7-Flash} +if [ ! -e "$MODEL_PATH" ]; then + log "Warning: model path '$MODEL_PATH' does not exist. Proceeding anyway." +else + log "Using model path: $MODEL_PATH" +fi + +if ! command -v python >/dev/null 2>&1; then + log "Error: python not found in PATH" + exit 1 +fi + +count1=${#output_len[@]} +for ((i=0; i tuple[int, int, int] | None: + """Extract concurrency, input_len, output_len from filename. 
+ + Pattern: sglang-oai_0123_{concurrency}_{input}_{output}.jsonl + """ + if not filename.endswith(".jsonl"): + return None + parts = filename.replace(".jsonl", "").split("_") + if len(parts) < 5: + return None + try: + concurrency = int(parts[2]) + input_len = int(parts[3]) + output_len = int(parts[4]) + return concurrency, input_len, output_len + except (ValueError, IndexError): + return None + + +def load_benchmark_data(data_dir: Path) -> list[BenchmarkResult]: + """Load all benchmark results from JSONL files.""" + results = [] + + for filepath in data_dir.glob("sglang-oai_*.jsonl"): + parsed = parse_filename(filepath.name) + if not parsed: + continue + concurrency, input_len, output_len = parsed + + with open(filepath) as f: + for line in f: + data = json.loads(line) + results.append( + BenchmarkResult( + tag=data["tag"], + concurrency=concurrency, + input_len=input_len, + output_len=output_len, + request_throughput=data["request_throughput"], + output_throughput=data["output_throughput"], + mean_ttft_ms=data["mean_ttft_ms"], + p99_ttft_ms=data["p99_ttft_ms"], + mean_itl_ms=data["mean_itl_ms"], + p99_itl_ms=data["p99_itl_ms"], + ) + ) + + return results + + +def group_by_scenario(results: list[BenchmarkResult]) -> dict[str, list[BenchmarkResult]]: + """Group results by scenario (input_len -> output_len).""" + grouped = defaultdict(list) + for r in results: + key = f"{r.input_len}→{r.output_len}" + grouped[key].append(r) + return grouped + + +def find_best_backend(results: list[BenchmarkResult]) -> tuple[str, float]: + """Find the backend with highest throughput for given results.""" + best = max(results, key=lambda r: r.output_throughput) + return best.tag, best.output_throughput + + +def generate_executive_summary(scenarios: dict[str, list[BenchmarkResult]]) -> str: + """Generate executive summary table.""" + lines = [ + "## Executive Summary", + "", + "Best performing backend per scenario (highest output throughput):", + "", + "| Scenario | Best Backend | Max Throughput (tok/s) | @ Concurrency |", + "|----------|--------------|------------------------|---------------|", + ] + + scenario_order = ["1000→1000", "1000→8000", "8000→1000"] + for scenario in scenario_order: + if scenario not in scenarios: + continue + results = scenarios[scenario] + best = max(results, key=lambda r: r.output_throughput) + lines.append(f"| {scenario} | **{best.tag}** | {best.output_throughput:,.1f} | {best.concurrency} |") + + lines.append("") + return "\n".join(lines) + + +def generate_scenario_table(scenario: str, results: list[BenchmarkResult]) -> str: + """Generate detailed comparison table for a scenario.""" + # Scenario descriptions + scenario_desc = { + "1000→1000": "Short Input → Short Output", + "1000→8000": "Short Input → Long Output", + "8000→1000": "Long Input → Short Output", + } + desc = scenario_desc.get(scenario, scenario) + + lines = [ + f"### Scenario: {scenario} ({desc})", + "", + "| Concurrency | Backend | QPS | Throughput (tok/s) | TTFT (ms) | P99 TTFT (ms) | ITL (ms) | P99 ITL (ms) |", + "|-------------|------------|-------|-------------------|-----------|---------------|----------|--------------|", + ] + + # Sort by concurrency first, then by backend + backend_order = ["triton", "fa3", "flashmla", "flashinfer"] + concurrency_order = [10, 80, 320] + + sorted_results = sorted( + results, + key=lambda r: ( + concurrency_order.index(r.concurrency) if r.concurrency in concurrency_order else 99, + backend_order.index(r.tag) if r.tag in backend_order else 99, + ), + ) + + current_concurrency 
= None + for r in sorted_results: + conc_display = str(r.concurrency) if r.concurrency != current_concurrency else "" + current_concurrency = r.concurrency + + lines.append( + f"| {conc_display:11} | {r.tag:10} | " + f"{r.request_throughput:5.2f} | " + f"{r.output_throughput:17,.1f} | " + f"{r.mean_ttft_ms:9.1f} | " + f"{r.p99_ttft_ms:13.1f} | " + f"{r.mean_itl_ms:8.2f} | " + f"{r.p99_itl_ms:12.2f} |" + ) + + lines.append("") + return "\n".join(lines) + + +def generate_key_findings(results: list[BenchmarkResult]) -> str: + """Generate key findings section.""" + lines = [ + "## Key Findings", + "", + ] + + # 1. Highest overall throughput + best_overall = max(results, key=lambda r: r.output_throughput) + lines.append( + f"1. **Highest Throughput**: {best_overall.tag} achieves " + f"{best_overall.output_throughput:,.1f} tok/s on " + f"{best_overall.input_len}→{best_overall.output_len} @ concurrency {best_overall.concurrency}" + ) + + # 2. Lowest TTFT + best_ttft = min(results, key=lambda r: r.mean_ttft_ms) + lines.append( + f"2. **Lowest TTFT**: {best_ttft.tag} achieves " + f"{best_ttft.mean_ttft_ms:.1f}ms on " + f"{best_ttft.input_len}→{best_ttft.output_len} @ concurrency {best_ttft.concurrency}" + ) + + # 3. Lowest ITL + best_itl = min(results, key=lambda r: r.mean_itl_ms) + lines.append( + f"3. **Lowest ITL**: {best_itl.tag} achieves " + f"{best_itl.mean_itl_ms:.2f}ms on " + f"{best_itl.input_len}→{best_itl.output_len} @ concurrency {best_itl.concurrency}" + ) + + # 4. Concurrency scaling analysis + lines.append("") + lines.append("### Concurrency Scaling (1000→8000 scenario)") + lines.append("") + lines.append("| Backend | 10 conc | 80 conc | 320 conc | Scale Factor |") + lines.append("|---------|---------|---------|----------|--------------|") + + scenario_results = [r for r in results if r.input_len == 1000 and r.output_len == 8000] + backend_perf = defaultdict(dict) + for r in scenario_results: + backend_perf[r.tag][r.concurrency] = r.output_throughput + + for backend in ["triton", "fa3", "flashmla", "flashinfer"]: + if backend not in backend_perf: + continue + perf = backend_perf[backend] + if 10 in perf and 320 in perf: + scale = perf[320] / perf[10] if perf[10] > 0 else 0 + lines.append( + f"| {backend:8} | {perf.get(10, 0):7,.1f} | " + f"{perf.get(80, 0):7,.1f} | {perf.get(320, 0):8,.1f} | {scale:12.1f}x |" + ) + + lines.append("") + return "\n".join(lines) + + +def generate_report(data_dir: Path) -> str: + """Generate the full benchmark report.""" + results = load_benchmark_data(data_dir) + + if not results: + return "# Error\n\nNo benchmark data found." 
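+    # Assemble the report: group measurements by scenario key (e.g. "1000→1000"),
+    # then emit the header, executive summary, per-scenario tables, and key findings.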
+ + scenarios = group_by_scenario(results) + + report = [ + "# GLM-4.7-Flash Attention Backend Performance Report", + "", + f"Generated from {len(results)} benchmark measurements.", + "", + "**Backends compared**: triton, fa3, flashmla, flashinfer", + "", + "**Scenarios**:", + "- 1000→1000: Short input, short output", + "- 1000→8000: Short input, long output", + "- 8000→1000: Long input, short output", + "", + "**Concurrency levels**: 10, 80, 320", + "", + "---", + "", + generate_executive_summary(scenarios), + "---", + "", + "## Detailed Results by Scenario", + "", + ] + + # Generate tables in consistent order + for scenario in ["1000→1000", "1000→8000", "8000→1000"]: + if scenario in scenarios: + report.append(generate_scenario_table(scenario, scenarios[scenario])) + + report.append("---") + report.append("") + report.append(generate_key_findings(results)) + + return "\n".join(report) + + +def main(): + script_dir = Path(__file__).parent + report = generate_report(script_dir) + + output_path = script_dir / "BENCHMARK_REPORT.md" + with open(output_path, "w") as f: + f.write(report) + + print(f"Report generated: {output_path}") + print(report) + + +if __name__ == "__main__": + main() diff --git a/last_bench/sglang-oai_0123_10_1000_1000.jsonl b/last_bench/sglang-oai_0123_10_1000_1000.jsonl new file mode 100644 index 000000000..dca82481a --- /dev/null +++ b/last_bench/sglang-oai_0123_10_1000_1000.jsonl @@ -0,0 +1,7 @@ +{"tag": "triton", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 1, "sharegpt_output_len": null, "random_input_len": 1000, "random_output_len": 1000, "random_range_ratio": 0.0, "server_info": null, "duration": 41.56710090395063, "completed": 10, "total_input_tokens": 6101, "total_input_text_tokens": 6101, "total_input_vision_tokens": 0, "total_output_tokens": 4220, "total_output_tokens_retokenized": 4219, "request_throughput": 0.24057487249608928, "input_throughput": 146.77472970986406, "output_throughput": 101.52259619334967, "total_throughput": 248.29732590321373, "mean_e2e_latency_ms": 4141.624511638656, "median_e2e_latency_ms": 3203.8242494454607, "std_e2e_latency_ms": 2485.9917727216202, "p90_e2e_latency_ms": 7356.209329259581, "p99_e2e_latency_ms": 9327.435238955077, "mean_ttft_ms": 269.6008140454069, "median_ttft_ms": 96.12666291650385, "std_ttft_ms": 370.8697777080779, "p99_ttft_ms": 1260.9661481156945, "mean_tpot_ms": 9.034784566756835, "median_tpot_ms": 9.118396484478719, "std_tpot_ms": 0.4145035968282636, "p99_tpot_ms": 9.59984958153813, "mean_itl_ms": 9.197189547221072, "median_itl_ms": 9.218816994689405, "std_itl_ms": 0.49230433913096455, "p95_itl_ms": 9.893044945783913, "p99_itl_ms": 10.180208240635693, "concurrency": 0.9963707888141476, "accept_length": null, "max_output_tokens_per_s": 120.0, "max_concurrent_requests": 2} +{"tag": "fa3", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 1, "sharegpt_output_len": null, "random_input_len": 1000, "random_output_len": 1000, "random_range_ratio": 0.0, "server_info": null, "duration": 38.03990558185615, "completed": 10, "total_input_tokens": 6101, "total_input_text_tokens": 6101, "total_input_vision_tokens": 0, "total_output_tokens": 4220, "total_output_tokens_retokenized": 4218, "request_throughput": 0.2628818302001698, "input_throughput": 160.3842046051236, "output_throughput": 110.93613234447167, "total_throughput": 271.3203369495953, "mean_e2e_latency_ms": 3789.365150523372, "median_e2e_latency_ms": 3030.7643914129585, 
"std_e2e_latency_ms": 2347.649909442257, "p90_e2e_latency_ms": 6861.860890546814, "p99_e2e_latency_ms": 8518.549765171483, "mean_ttft_ms": 105.94553446862847, "median_ttft_ms": 94.56806734669954, "std_ttft_ms": 27.94510373820782, "p99_ttft_ms": 176.04060200043023, "mean_tpot_ms": 8.735117569860298, "median_tpot_ms": 8.751374765776054, "std_tpot_ms": 0.04156977680352886, "p99_tpot_ms": 8.76773780092932, "mean_itl_ms": 8.749201726566584, "median_itl_ms": 8.751522051170468, "std_itl_ms": 0.14378221568601485, "p95_itl_ms": 8.877559565007687, "p99_itl_ms": 8.959061477798969, "concurrency": 0.9961552460663261, "accept_length": null, "max_output_tokens_per_s": 115.0, "max_concurrent_requests": 2} +{"tag": "flashmla", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 1, "sharegpt_output_len": null, "random_input_len": 1000, "random_output_len": 1000, "random_range_ratio": 0.0, "server_info": null, "duration": 38.40182148804888, "completed": 10, "total_input_tokens": 6101, "total_input_text_tokens": 6101, "total_input_vision_tokens": 0, "total_output_tokens": 4220, "total_output_tokens_retokenized": 4215, "request_throughput": 0.26040431449617885, "input_throughput": 158.8726722741187, "output_throughput": 109.89062071738748, "total_throughput": 268.7632929915062, "mean_e2e_latency_ms": 3825.576449208893, "median_e2e_latency_ms": 3036.6330899996683, "std_e2e_latency_ms": 2415.8852833995174, "p90_e2e_latency_ms": 6976.653711660765, "p99_e2e_latency_ms": 8777.255602276418, "mean_ttft_ms": 112.22376921214163, "median_ttft_ms": 94.2523704143241, "std_ttft_ms": 44.1469574516126, "p99_ttft_ms": 221.72820456326008, "mean_tpot_ms": 8.747894237986943, "median_tpot_ms": 8.784269532090837, "std_tpot_ms": 0.1802130309851715, "p99_tpot_ms": 8.992865002797622, "mean_itl_ms": 8.820299398443815, "median_itl_ms": 8.825775003060699, "std_itl_ms": 0.22406159202597428, "p95_itl_ms": 9.14960989030078, "p99_itl_ms": 9.246604079380631, "concurrency": 0.9961966128089678, "accept_length": null, "max_output_tokens_per_s": 118.0, "max_concurrent_requests": 2} +{"tag": "flashinfer", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 1, "sharegpt_output_len": null, "random_input_len": 1000, "random_output_len": 1000, "random_range_ratio": 0.0, "server_info": null, "duration": 35.988416022853926, "completed": 10, "total_input_tokens": 6101, "total_input_text_tokens": 6101, "total_input_vision_tokens": 0, "total_output_tokens": 4220, "total_output_tokens_retokenized": 3918, "request_throughput": 0.27786718908800107, "input_throughput": 169.52677206258946, "output_throughput": 117.25995379513645, "total_throughput": 286.7867258577259, "mean_e2e_latency_ms": 3584.2942386865616, "median_e2e_latency_ms": 2880.021173041314, "std_e2e_latency_ms": 2210.4487376161856, "p90_e2e_latency_ms": 6500.186281767674, "p99_e2e_latency_ms": 8042.241794501898, "mean_ttft_ms": 113.1668952992186, "median_ttft_ms": 102.46228356845677, "std_ttft_ms": 23.684222136567993, "p99_ttft_ms": 164.71267985878512, "mean_tpot_ms": 8.226941944769623, "median_tpot_ms": 8.265041945772195, "std_tpot_ms": 0.08986236974100431, "p99_tpot_ms": 8.326550089040984, "mean_itl_ms": 8.250821643859407, "median_itl_ms": 8.252424886450171, "std_itl_ms": 0.3354421269852178, "p95_itl_ms": 8.401609491556883, "p99_itl_ms": 8.499805759638548, "concurrency": 0.9959577649681517, "accept_length": null, "max_output_tokens_per_s": 123.0, "max_concurrent_requests": 2} +{"tag": "lightllm-fa3", 
"backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 1, "sharegpt_output_len": null, "random_input_len": 1000, "random_output_len": 1000, "random_range_ratio": 0.0, "server_info": null, "duration": 28.220483848825097, "completed": 10, "total_input_tokens": 6101, "total_input_text_tokens": 6101, "total_input_vision_tokens": 0, "total_output_tokens": 4220, "total_output_tokens_retokenized": 4219, "request_throughput": 0.35435253532750216, "input_throughput": 216.19048180330907, "output_throughput": 149.5367699082059, "total_throughput": 365.727251711515, "mean_e2e_latency_ms": 2821.2477078894153, "median_e2e_latency_ms": 2261.518812039867, "std_e2e_latency_ms": 1752.5711889017846, "p90_e2e_latency_ms": 5119.068665453233, "p99_e2e_latency_ms": 6318.86261240812, "mean_ttft_ms": 60.56937051471323, "median_ttft_ms": 61.048407456837595, "std_ttft_ms": 10.235625045106131, "p99_ttft_ms": 72.07421808270738, "mean_tpot_ms": 6.549284494990456, "median_tpot_ms": 6.55624221613761, "std_tpot_ms": 0.026076747154878004, "p99_tpot_ms": 6.5676378215672475, "mean_itl_ms": 6.558974201037124, "median_itl_ms": 6.219592876732349, "std_itl_ms": 0.8434367069907109, "p95_itl_ms": 8.309309324249625, "p99_itl_ms": 8.402309883385898, "concurrency": 0.9997162780775186, "accept_length": null, "max_output_tokens_per_s": 153.0, "max_concurrent_requests": 2} +{"tag": "lightllm-flashinfer", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 1, "sharegpt_output_len": null, "random_input_len": 1000, "random_output_len": 1000, "random_range_ratio": 0.0, "server_info": null, "duration": 28.21518262499012, "completed": 10, "total_input_tokens": 6101, "total_input_text_tokens": 6101, "total_input_vision_tokens": 0, "total_output_tokens": 4220, "total_output_tokens_retokenized": 4220, "request_throughput": 0.3544191130325353, "input_throughput": 216.2311008611498, "output_throughput": 149.5648656997299, "total_throughput": 365.7959665608797, "mean_e2e_latency_ms": 2820.835434575565, "median_e2e_latency_ms": 2276.5965425642207, "std_e2e_latency_ms": 1759.2921206087265, "p90_e2e_latency_ms": 5145.300170383415, "p99_e2e_latency_ms": 6369.066203290132, "mean_ttft_ms": 58.56175431981683, "median_ttft_ms": 52.00532451272011, "std_ttft_ms": 10.69389116363121, "p99_ttft_ms": 72.72814616095275, "mean_tpot_ms": 6.5418024206640935, "median_tpot_ms": 6.5778018938299265, "std_tpot_ms": 0.10667361038828493, "p99_tpot_ms": 6.681071237688413, "mean_itl_ms": 6.562763047186636, "median_itl_ms": 6.223411066457629, "std_itl_ms": 0.8022116495868372, "p95_itl_ms": 8.315606787800789, "p99_itl_ms": 8.406318174675107, "concurrency": 0.9997579927330181, "accept_length": null, "max_output_tokens_per_s": 158.0, "max_concurrent_requests": 2} +{"tag": "lightllm-triton", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 1, "sharegpt_output_len": null, "random_input_len": 1000, "random_output_len": 1000, "random_range_ratio": 0.0, "server_info": null, "duration": 28.820174572058022, "completed": 10, "total_input_tokens": 6101, "total_input_text_tokens": 6101, "total_input_vision_tokens": 0, "total_output_tokens": 4220, "total_output_tokens_retokenized": 4220, "request_throughput": 0.3469791612468331, "input_throughput": 211.6919862766929, "output_throughput": 146.42520604616357, "total_throughput": 358.11719232285645, "mean_e2e_latency_ms": 2881.241054646671, "median_e2e_latency_ms": 2301.2861979659647, "std_e2e_latency_ms": 
1878.5403060746103, "p90_e2e_latency_ms": 5358.587349089794, "p99_e2e_latency_ms": 6771.017882206944, "mean_ttft_ms": 54.279814031906426, "median_ttft_ms": 50.7953364867717, "std_ttft_ms": 8.100198475876049, "p99_ttft_ms": 70.34216834232211, "mean_tpot_ms": 6.580841954172073, "median_tpot_ms": 6.67901388540887, "std_tpot_ms": 0.350263682853537, "p99_tpot_ms": 7.056966771200029, "mean_itl_ms": 6.716453389913626, "median_itl_ms": 6.224676966667175, "std_itl_ms": 0.9832705612541962, "p95_itl_ms": 8.326812135055661, "p99_itl_ms": 8.418112937361002, "concurrency": 0.9997306044912427, "accept_length": null, "max_output_tokens_per_s": 165.0, "max_concurrent_requests": 2} diff --git a/last_bench/sglang-oai_0123_10_1000_8000.jsonl b/last_bench/sglang-oai_0123_10_1000_8000.jsonl new file mode 100644 index 000000000..b6c1a8b66 --- /dev/null +++ b/last_bench/sglang-oai_0123_10_1000_8000.jsonl @@ -0,0 +1,7 @@ +{"tag": "triton", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 1, "sharegpt_output_len": null, "random_input_len": 1000, "random_output_len": 8000, "random_range_ratio": 0.0, "server_info": {"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30000, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 1, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": null, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 68555837, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, 
"show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "triton", "decode_attention_backend": null, "prefill_attention_backend": null, "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", 
"mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": false, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, 
"enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "status": "ready", "max_total_num_tokens": 1386225, "max_req_input_len": 202746, "internal_states": [{"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30000, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 1, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": 3500, "pp_async_batch_depth": 0, 
"stream_interval": 1, "stream_output": false, "random_seed": 68555837, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "triton", "decode_attention_backend": "triton", "prefill_attention_backend": "triton", "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, 
"deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, 
"enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": true, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "use_mla_backend": true, "last_gen_throughput": 68.4354602959235, "memory_usage": {"weight": 56.33, "kvcache": 69.9, "token_capacity": 1386225, "graph": 1.69}, "effective_max_running_requests_per_dp": 3500}], "version": "0.0.0.dev1+g1e8e0cca2"}, "duration": 528.4287535739131, "completed": 10, "total_input_tokens": 6101, "total_input_text_tokens": 6101, "total_input_vision_tokens": 0, "total_output_tokens": 44462, "total_output_tokens_retokenized": 44449, "request_throughput": 0.01892402699960434, "input_throughput": 11.545548872458607, "output_throughput": 84.1400088456408, "total_throughput": 95.68555771809942, "mean_e2e_latency_ms": 52840.63192738686, "median_e2e_latency_ms": 54310.54771656636, "std_e2e_latency_ms": 32405.91208850261, "p90_e2e_latency_ms": 95479.23450903034, "p99_e2e_latency_ms": 100470.4938506335, "mean_ttft_ms": 143.15742638427764, "median_ttft_ms": 105.27272755280137, "std_ttft_ms": 89.88346432853983, "p99_ttft_ms": 384.79625445324933, "mean_tpot_ms": 11.041872593796489, "median_tpot_ms": 11.331380416371044, "std_tpot_ms": 1.5086878772657037, "p99_tpot_ms": 12.987873410365033, "mean_itl_ms": 11.857849385685803, "median_itl_ms": 11.53362705372274, "std_itl_ms": 2.106867488894714, "p95_itl_ms": 15.640562167391181, "p99_itl_ms": 16.497120447456837, "concurrency": 0.999957545270024, 
"accept_length": null, "max_output_tokens_per_s": 121.0, "max_concurrent_requests": 2} +{"tag": "fa3", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 1, "sharegpt_output_len": null, "random_input_len": 1000, "random_output_len": 8000, "random_range_ratio": 0.0, "server_info": {"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30001, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 1, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": null, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 1062566034, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, 
"hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "fa3", "decode_attention_backend": null, "prefill_attention_backend": null, "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 
32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": false, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], 
"custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "status": "ready", "max_total_num_tokens": 1386225, "max_req_input_len": 202746, "internal_states": [{"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30001, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 1, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": 3500, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 1062566034, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, 
"bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "fa3", "decode_attention_backend": "fa3", "prefill_attention_backend": "fa3", "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", 
"disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": false, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": 
false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "use_mla_backend": true, "last_gen_throughput": 111.72392428829242, "memory_usage": {"weight": 56.33, "kvcache": 69.9, "token_capacity": 1386225, "graph": 2.36}, "effective_max_running_requests_per_dp": 3500}], "version": "0.0.0.dev1+g1e8e0cca2"}, "duration": 393.95918229292147, "completed": 10, "total_input_tokens": 6101, "total_input_text_tokens": 6101, "total_input_vision_tokens": 0, "total_output_tokens": 44462, "total_output_tokens_retokenized": 44449, "request_throughput": 0.025383340329315322, "input_throughput": 15.486375934915278, "output_throughput": 112.85940777220178, "total_throughput": 128.34578370711705, "mean_e2e_latency_ms": 39394.25162828993, "median_e2e_latency_ms": 42470.40231700521, "std_e2e_latency_ms": 21636.962726875605, "p90_e2e_latency_ms": 66409.18134008534, "p99_e2e_latency_ms": 68663.32626647316, "mean_ttft_ms": 111.9626430561766, "median_ttft_ms": 101.87083506025374, "std_ttft_ms": 17.221441402427743, "p99_ttft_ms": 138.93227798631418, "mean_tpot_ms": 8.802688235486457, "median_tpot_ms": 8.818598380117141, "std_tpot_ms": 0.06861274446224537, "p99_tpot_ms": 8.872189325777665, "mean_itl_ms": 8.83899862207603, "median_itl_ms": 8.827421930618584, "std_itl_ms": 0.18904639497524522, "p95_itl_ms": 9.004280308727175, "p99_itl_ms": 9.104600874707103, "concurrency": 0.9999576960995675, "accept_length": null, "max_output_tokens_per_s": 115.0, "max_concurrent_requests": 2} +{"tag": "flashmla", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 1, "sharegpt_output_len": null, "random_input_len": 1000, "random_output_len": 8000, "random_range_ratio": 0.0, "server_info": {"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30002, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, 
"dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 64, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": null, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 1011399013, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "flashmla", "decode_attention_backend": null, "prefill_attention_backend": null, "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": 
"auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": 
false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": false, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "status": "ready", "max_total_num_tokens": 1386176, "max_req_input_len": 202746, "internal_states": [{"model_path": 
"/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30002, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 64, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": 3500, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 1011399013, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, 
"json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "flashmla", "decode_attention_backend": "flashmla", "prefill_attention_backend": "flashmla", "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, 
"cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": false, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", 
"remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "use_mla_backend": true, "last_gen_throughput": 90.13693125010776, "memory_usage": {"weight": 56.33, "kvcache": 69.9, "token_capacity": 1386176, "graph": 1.21}, "effective_max_running_requests_per_dp": 3500}], "version": "0.0.0.dev1+g1e8e0cca2"}, "duration": 443.0159071299713, "completed": 10, "total_input_tokens": 6101, "total_input_text_tokens": 6101, "total_input_vision_tokens": 0, "total_output_tokens": 44462, "total_output_tokens_retokenized": 44439, "request_throughput": 0.022572552901731845, "input_throughput": 13.7715145253466, "output_throughput": 100.36208471168014, "total_throughput": 114.13359923702673, "mean_e2e_latency_ms": 44299.69251912553, "median_e2e_latency_ms": 46765.18414099701, "std_e2e_latency_ms": 25605.205296934604, "p90_e2e_latency_ms": 77119.38225780614, "p99_e2e_latency_ms": 80277.98145196866, "mean_ttft_ms": 164.6503965370357, "median_ttft_ms": 98.8025760743767, "std_ttft_ms": 184.20579272535184, "p99_ttft_ms": 664.3669738760218, "mean_tpot_ms": 9.603287608095876, "median_tpot_ms": 9.73514685671584, "std_tpot_ms": 0.6061427340384441, "p99_tpot_ms": 10.30806361595789, "mean_itl_ms": 9.931152820655184, "median_itl_ms": 9.84680699184537, "std_itl_ms": 0.8601496753226304, "p95_itl_ms": 11.435698019340634, "p99_itl_ms": 11.772641446441412, "concurrency": 0.9999571529184155, "accept_length": null, "max_output_tokens_per_s": 118.0, "max_concurrent_requests": 2} +{"tag": "flashinfer", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 1, "sharegpt_output_len": null, "random_input_len": 1000, "random_output_len": 8000, "random_range_ratio": 0.0, "server_info": {"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30003, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 1, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": 
false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": null, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 493872944, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "flashinfer", "decode_attention_backend": null, "prefill_attention_backend": null, "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, 
"speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, 
"triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": false, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "status": "ready", "max_total_num_tokens": 1386222, "max_req_input_len": 202746, "internal_states": [{"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30003, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, 
"mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 1, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": 3500, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 493872944, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "flashinfer", "decode_attention_backend": "flashinfer", "prefill_attention_backend": "flashinfer", "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, 
"speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, 
"tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": false, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "use_mla_backend": true, "last_gen_throughput": 116.78249468138866, "memory_usage": {"weight": 56.33, "kvcache": 69.9, "token_capacity": 1386222, "graph": 1.71}, "effective_max_running_requests_per_dp": 3500}], "version": "0.0.0.dev1+g1e8e0cca2"}, "duration": 370.64318000199273, "completed": 10, "total_input_tokens": 6101, "total_input_text_tokens": 6101, "total_input_vision_tokens": 0, "total_output_tokens": 44462, 
"total_output_tokens_retokenized": 41404, "request_throughput": 0.026980126816163828, "input_throughput": 16.46057537054155, "output_throughput": 119.95903985002761, "total_throughput": 136.41961522056914, "mean_e2e_latency_ms": 37062.73019104265, "median_e2e_latency_ms": 39886.838593520224, "std_e2e_latency_ms": 20455.257739139928, "p90_e2e_latency_ms": 62683.871512743644, "p99_e2e_latency_ms": 64792.23027887987, "mean_ttft_ms": 122.99183551222086, "median_ttft_ms": 100.1137139974162, "std_ttft_ms": 45.0379113541844, "p99_ttft_ms": 223.7851848336868, "mean_tpot_ms": 8.248293524737983, "median_tpot_ms": 8.286151737614464, "std_tpot_ms": 0.12572598005984015, "p99_tpot_ms": 8.366339839761116, "mean_itl_ms": 8.340799370035327, "median_itl_ms": 8.37263313587755, "std_itl_ms": 2.689567428376299, "p95_itl_ms": 8.641683915629983, "p99_itl_ms": 8.737169874366373, "concurrency": 0.9999571607075943, "accept_length": null, "max_output_tokens_per_s": 131.0, "max_concurrent_requests": 2} +{"tag": "lightllm-fa3", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 1, "sharegpt_output_len": null, "random_input_len": 1000, "random_output_len": 8000, "random_range_ratio": 0.0, "server_info": null, "duration": 296.87213805201463, "completed": 10, "total_input_tokens": 6101, "total_input_text_tokens": 6101, "total_input_vision_tokens": 0, "total_output_tokens": 44462, "total_output_tokens_retokenized": 44460, "request_throughput": 0.03368453525351682, "input_throughput": 20.550934958170615, "output_throughput": 149.7681806441865, "total_throughput": 170.3191156023571, "mean_e2e_latency_ms": 29686.422172025777, "median_e2e_latency_ms": 31977.473180973902, "std_e2e_latency_ms": 16341.231223519195, "p90_e2e_latency_ms": 50013.6461708229, "p99_e2e_latency_ms": 52065.068823597394, "mean_ttft_ms": 206.2621021643281, "median_ttft_ms": 163.88948948588222, "std_ttft_ms": 148.8543077347853, "p99_ttft_ms": 593.3920549717733, "mean_tpot_ms": 6.604791340804418, "median_tpot_ms": 6.613572524991938, "std_tpot_ms": 0.05190789972059481, "p99_tpot_ms": 6.664961845437845, "mean_itl_ms": 6.637431996100034, "median_itl_ms": 6.223700940608978, "std_itl_ms": 0.8980564150345929, "p95_itl_ms": 8.328629587776959, "p99_itl_ms": 8.446590006351471, "concurrency": 0.9999733342043857, "accept_length": null, "max_output_tokens_per_s": 153.0, "max_concurrent_requests": 2} +{"tag": "lightllm-flashinfer", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 1, "sharegpt_output_len": null, "random_input_len": 1000, "random_output_len": 8000, "random_range_ratio": 0.0, "server_info": null, "duration": 300.56394808203913, "completed": 10, "total_input_tokens": 6101, "total_input_text_tokens": 6101, "total_input_vision_tokens": 0, "total_output_tokens": 44462, "total_output_tokens_retokenized": 44459, "request_throughput": 0.033270790005960706, "input_throughput": 20.298508982636626, "output_throughput": 147.9285865245025, "total_throughput": 168.22709550713913, "mean_e2e_latency_ms": 30055.323450965807, "median_e2e_latency_ms": 32303.914289921522, "std_e2e_latency_ms": 16614.672857992904, "p90_e2e_latency_ms": 50797.29593801312, "p99_e2e_latency_ms": 52831.220259703696, "mean_ttft_ms": 153.99227743037045, "median_ttft_ms": 154.41859746351838, "std_ttft_ms": 40.539656892122096, "p99_ttft_ms": 230.53783738985658, "mean_tpot_ms": 6.687357886347274, "median_tpot_ms": 6.697858736484612, "std_tpot_ms": 0.0747425716300049, "p99_tpot_ms": 6.810022278762517, 
"mean_itl_ms": 6.7319551750847815, "median_itl_ms": 6.230928935110569, "std_itl_ms": 1.0190941574897405, "p95_itl_ms": 8.343739341944456, "p99_itl_ms": 8.449985329061745, "concurrency": 0.9999643550983096, "accept_length": null, "max_output_tokens_per_s": 157.0, "max_concurrent_requests": 2} +{"tag": "lightllm-triton", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 1, "sharegpt_output_len": null, "random_input_len": 1000, "random_output_len": 8000, "random_range_ratio": 0.0, "server_info": null, "duration": 402.82201804290526, "completed": 10, "total_input_tokens": 6101, "total_input_text_tokens": 6101, "total_input_vision_tokens": 0, "total_output_tokens": 44462, "total_output_tokens_retokenized": 44462, "request_throughput": 0.024824859496470928, "input_throughput": 15.145646778796912, "output_throughput": 110.37629029320904, "total_throughput": 125.52193707200595, "mean_e2e_latency_ms": 40281.3204290811, "median_e2e_latency_ms": 40955.73136943858, "std_e2e_latency_ms": 25145.381849844358, "p90_e2e_latency_ms": 73342.79716589954, "p99_e2e_latency_ms": 78086.60535515985, "mean_ttft_ms": 200.0209585763514, "median_ttft_ms": 171.22362996451557, "std_ttft_ms": 140.57473101890918, "p99_ttft_ms": 571.9467777409592, "mean_tpot_ms": 8.330440810182232, "median_tpot_ms": 8.543196817424905, "std_tpot_ms": 1.272966997457874, "p99_tpot_ms": 10.03476722604763, "mean_itl_ms": 9.024065831756937, "median_itl_ms": 8.3220349624753, "std_itl_ms": 2.06384829597527, "p95_itl_ms": 12.451428978238255, "p99_itl_ms": 14.448113285470754, "concurrency": 0.9999781199842622, "accept_length": null, "max_output_tokens_per_s": 163.0, "max_concurrent_requests": 2} diff --git a/last_bench/sglang-oai_0123_10_8000_1000.jsonl b/last_bench/sglang-oai_0123_10_8000_1000.jsonl new file mode 100644 index 000000000..ab41238d5 --- /dev/null +++ b/last_bench/sglang-oai_0123_10_8000_1000.jsonl @@ -0,0 +1,7 @@ +{"tag": "triton", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 1, "sharegpt_output_len": null, "random_input_len": 8000, "random_output_len": 1000, "random_range_ratio": 0.0, "server_info": {"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30000, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, 
"page_size": 1, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": null, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 604505033, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "triton", "decode_attention_backend": null, "prefill_attention_backend": null, "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, 
"speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": 
"eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": false, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "status": "ready", "max_total_num_tokens": 1386224, "max_req_input_len": 202746, "internal_states": [{"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30000, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, 
"modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 1, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": 3500, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 604505033, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "triton", "decode_attention_backend": "triton", "prefill_attention_backend": "triton", "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, 
"speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, 
"enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": true, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "use_mla_backend": true, "last_gen_throughput": 73.51944093316781, "memory_usage": {"weight": 56.33, "kvcache": 69.9, "token_capacity": 1386224, "graph": 1.69}, "effective_max_running_requests_per_dp": 3500}], "version": "0.0.0.dev1+g1e8e0cca2"}, "duration": 57.25094021507539, "completed": 10, "total_input_tokens": 41941, 
"total_input_text_tokens": 41941, "total_input_vision_tokens": 0, "total_output_tokens": 4220, "total_output_tokens_retokenized": 4220, "request_throughput": 0.1746696204889014, "input_throughput": 732.5818552925012, "output_throughput": 73.71057984631638, "total_throughput": 806.2924351388176, "mean_e2e_latency_ms": 5722.978232218884, "median_e2e_latency_ms": 4641.29843947012, "std_e2e_latency_ms": 3921.2995928261576, "p90_e2e_latency_ms": 12518.891114904545, "p99_e2e_latency_ms": 12771.664290658664, "mean_ttft_ms": 217.65351290814579, "median_ttft_ms": 177.99088754691184, "std_ttft_ms": 112.65007093616718, "p99_ttft_ms": 425.97797273192555, "mean_tpot_ms": 12.595701176155286, "median_tpot_ms": 13.11032631924969, "std_tpot_ms": 2.5773326227353888, "p99_tpot_ms": 16.479375187077174, "mean_itl_ms": 13.076763751810404, "median_itl_ms": 13.160730595700443, "std_itl_ms": 2.441664585133937, "p95_itl_ms": 16.710556542966515, "p99_itl_ms": 16.849553296342492, "concurrency": 0.9996304358879161, "accept_length": null, "max_output_tokens_per_s": 110.0, "max_concurrent_requests": 2} +{"tag": "fa3", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 1, "sharegpt_output_len": null, "random_input_len": 8000, "random_output_len": 1000, "random_range_ratio": 0.0, "server_info": {"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30001, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 1, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": null, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 398782077, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, 
"log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "fa3", "decode_attention_backend": null, "prefill_attention_backend": null, "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, 
"deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": false, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, 
"enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "status": "ready", "max_total_num_tokens": 1386225, "max_req_input_len": 202746, "internal_states": [{"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30001, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 1, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, 
"prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": 3500, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 398782077, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "fa3", "decode_attention_backend": "fa3", "prefill_attention_backend": "fa3", "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, 
"moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, 
"enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": false, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "use_mla_backend": true, "last_gen_throughput": 112.54967193050531, "memory_usage": {"weight": 56.33, "kvcache": 69.9, "token_capacity": 1386225, "graph": 2.36}, "effective_max_running_requests_per_dp": 3500}], "version": "0.0.0.dev1+g1e8e0cca2"}, "duration": 39.43922132183798, "completed": 10, "total_input_tokens": 41941, "total_input_text_tokens": 41941, "total_input_vision_tokens": 0, "total_output_tokens": 4220, "total_output_tokens_retokenized": 4220, "request_throughput": 0.25355470176240213, "input_throughput": 1063.433774661691, "output_throughput": 107.0000841437337, "total_throughput": 1170.4338588054245, "mean_e2e_latency_ms": 3942.344288667664, "median_e2e_latency_ms": 3168.642749893479, "std_e2e_latency_ms": 2358.272403480565, "p90_e2e_latency_ms": 7263.652879674918, "p99_e2e_latency_ms": 8585.422784292605, "mean_ttft_ms": 208.0219859490171, "median_ttft_ms": 162.48971258755773, "std_ttft_ms": 122.61083935112063, "p99_ttft_ms": 464.63888159254566, "mean_tpot_ms": 8.857586609576051, "median_tpot_ms": 8.86137127263336, "std_tpot_ms": 0.07794326844451306, "p99_tpot_ms": 8.973473299517387, "mean_itl_ms": 8.870108765793297, "median_itl_ms": 
8.866838878020644, "std_itl_ms": 0.13255333227863098, "p95_itl_ms": 9.031242376659065, "p99_itl_ms": 9.132659959141165, "concurrency": 0.999599930357839, "accept_length": null, "max_output_tokens_per_s": 115.0, "max_concurrent_requests": 2} +{"tag": "flashmla", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 1, "sharegpt_output_len": null, "random_input_len": 8000, "random_output_len": 1000, "random_range_ratio": 0.0, "server_info": {"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30002, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 64, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": null, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 379113072, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, 
"export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "flashmla", "decode_attention_backend": null, "prefill_attention_backend": null, "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, 
"kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": false, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, 
"num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "status": "ready", "max_total_num_tokens": 1386176, "max_req_input_len": 202746, "internal_states": [{"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30002, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 64, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": 3500, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 379113072, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": 
false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "flashmla", "decode_attention_backend": "flashmla", "prefill_attention_backend": "flashmla", "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 
256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": false, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, 
"dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "use_mla_backend": true, "last_gen_throughput": 94.37640848866731, "memory_usage": {"weight": 56.33, "kvcache": 69.9, "token_capacity": 1386176, "graph": 1.21}, "effective_max_running_requests_per_dp": 3500}], "version": "0.0.0.dev1+g1e8e0cca2"}, "duration": 44.99840556993149, "completed": 10, "total_input_tokens": 41941, "total_input_text_tokens": 41941, "total_input_vision_tokens": 0, "total_output_tokens": 4220, "total_output_tokens_retokenized": 4220, "request_throughput": 0.2222300962299457, "input_throughput": 932.0552465980153, "output_throughput": 93.78110060903708, "total_throughput": 1025.8363472070523, "mean_e2e_latency_ms": 4497.922634705901, "median_e2e_latency_ms": 3725.210659438744, "std_e2e_latency_ms": 2773.6783957420403, "p90_e2e_latency_ms": 8437.325873808004, "p99_e2e_latency_ms": 9939.448532455135, "mean_ttft_ms": 188.3448517182842, "median_ttft_ms": 149.44852597545832, "std_ttft_ms": 112.42184826547498, "p99_ttft_ms": 448.1769835739397, "mean_tpot_ms": 10.145170876880718, "median_tpot_ms": 10.421453350218435, "std_tpot_ms": 0.9104786801424741, "p99_tpot_ms": 11.68051373533182, "mean_itl_ms": 10.236510755759697, "median_itl_ms": 10.264639975503087, "std_itl_ms": 0.8240081868908503, "p95_itl_ms": 11.802727438043803, "p99_itl_ms": 11.90974765922874, "concurrency": 0.9995737799455433, "accept_length": null, "max_output_tokens_per_s": 113.0, "max_concurrent_requests": 2} +{"tag": "flashinfer", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 1, "sharegpt_output_len": null, "random_input_len": 8000, "random_output_len": 1000, "random_range_ratio": 0.0, "server_info": {"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": 
"127.0.0.1", "port": 30003, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 1, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": null, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 253908945, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "flashinfer", 
"decode_attention_backend": null, "prefill_attention_backend": null, "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, 
"enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": false, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, 
"decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "status": "ready", "max_total_num_tokens": 1386222, "max_req_input_len": 202746, "internal_states": [{"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30003, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 1, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": 3500, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 253908945, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, 
"reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "flashinfer", "decode_attention_backend": "flashinfer", "prefill_attention_backend": "flashinfer", "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, 
"cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": false, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, 
"remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "use_mla_backend": true, "last_gen_throughput": 117.92113979515487, "memory_usage": {"weight": 56.33, "kvcache": 69.9, "token_capacity": 1386222, "graph": 1.71}, "effective_max_running_requests_per_dp": 3500}], "version": "0.0.0.dev1+g1e8e0cca2"}, "duration": 37.378284689970315, "completed": 10, "total_input_tokens": 41941, "total_input_text_tokens": 41941, "total_input_vision_tokens": 0, "total_output_tokens": 4220, "total_output_tokens_retokenized": 4026, "request_throughput": 0.2675350161984103, "input_throughput": 1122.0686114377527, "output_throughput": 112.89977683572914, "total_throughput": 1234.9683882734819, "mean_e2e_latency_ms": 3736.1401027301326, "median_e2e_latency_ms": 3029.271812643856, "std_e2e_latency_ms": 2279.0599616437844, "p90_e2e_latency_ms": 6922.7269294671705, "p99_e2e_latency_ms": 8203.730608876795, "mean_ttft_ms": 169.47351093403995, "median_ttft_ms": 149.5549235260114, "std_ttft_ms": 73.39107755634313, "p99_ttft_ms": 270.47318189404905, "mean_tpot_ms": 8.438028617355302, "median_tpot_ms": 8.468548115720909, "std_tpot_ms": 0.17177658018927264, "p99_tpot_ms": 8.68933005085632, "mean_itl_ms": 8.471877963532142, "median_itl_ms": 8.466457016766071, "std_itl_ms": 0.2015365133171823, "p95_itl_ms": 8.739279105793685, "p99_itl_ms": 8.892325791530311, "concurrency": 0.9995483029034363, "accept_length": null, "max_output_tokens_per_s": 122.0, "max_concurrent_requests": 2} +{"tag": "lightllm-fa3", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 1, "sharegpt_output_len": null, "random_input_len": 8000, "random_output_len": 1000, "random_range_ratio": 0.0, "server_info": null, "duration": 29.579997348831967, "completed": 10, "total_input_tokens": 41941, "total_input_text_tokens": 41941, "total_input_vision_tokens": 0, "total_output_tokens": 4220, "total_output_tokens_retokenized": 4220, "request_throughput": 0.3380662912870367, "input_throughput": 1417.8838322869606, "output_throughput": 142.66397492312947, "total_throughput": 1560.54780721009, "mean_e2e_latency_ms": 2957.1972485166043, "median_e2e_latency_ms": 2444.4284136407077, "std_e2e_latency_ms": 1800.5337827383514, "p90_e2e_latency_ms": 5347.466037608682, "p99_e2e_latency_ms": 6513.965703966096, "mean_ttft_ms": 151.79783792700619, "median_ttft_ms": 153.29680847935379, "std_ttft_ms": 74.49831125817643, "p99_ttft_ms": 276.19091010885313, "mean_tpot_ms": 6.6520214002182465, "median_tpot_ms": 6.663941853427483, "std_tpot_ms": 0.06995932396737312, "p99_tpot_ms": 6.761680932473347, "mean_itl_ms": 6.665223839852749, "median_itl_ms": 6.226972909644246, "std_itl_ms": 0.9274401703373519, "p95_itl_ms": 8.330000611022115, "p99_itl_ms": 8.470633188262582, "concurrency": 0.9997287064102377, "accept_length": null, "max_output_tokens_per_s": 153.0, "max_concurrent_requests": 3} +{"tag": 
"lightllm-flashinfer", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 1, "sharegpt_output_len": null, "random_input_len": 8000, "random_output_len": 1000, "random_range_ratio": 0.0, "server_info": null, "duration": 29.990934344939888, "completed": 10, "total_input_tokens": 41941, "total_input_text_tokens": 41941, "total_input_vision_tokens": 0, "total_output_tokens": 4220, "total_output_tokens_retokenized": 4220, "request_throughput": 0.33343409328249934, "input_throughput": 1398.4559306361305, "output_throughput": 140.70918736521472, "total_throughput": 1539.1651180013453, "mean_e2e_latency_ms": 2998.289765859954, "median_e2e_latency_ms": 2464.879463543184, "std_e2e_latency_ms": 1852.130967789295, "p90_e2e_latency_ms": 5536.782492487691, "p99_e2e_latency_ms": 6639.743499069009, "mean_ttft_ms": 139.17544910218567, "median_ttft_ms": 142.81450887210667, "std_ttft_ms": 65.24136773664321, "p99_ttft_ms": 242.65094643458724, "mean_tpot_ms": 6.75879133261816, "median_tpot_ms": 6.808080749864214, "std_tpot_ms": 0.18544835486888658, "p99_tpot_ms": 7.013304534186441, "mean_itl_ms": 6.792839756158036, "median_itl_ms": 6.243136944249272, "std_itl_ms": 0.9610731556465486, "p95_itl_ms": 8.374860789626837, "p99_itl_ms": 8.49069862626493, "concurrency": 0.999732029477711, "accept_length": null, "max_output_tokens_per_s": 154.0, "max_concurrent_requests": 3} +{"tag": "lightllm-triton", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 1, "sharegpt_output_len": null, "random_input_len": 8000, "random_output_len": 1000, "random_range_ratio": 0.0, "server_info": null, "duration": 44.55748701398261, "completed": 10, "total_input_tokens": 41941, "total_input_text_tokens": 41941, "total_input_vision_tokens": 0, "total_output_tokens": 4220, "total_output_tokens_retokenized": 4220, "request_throughput": 0.22442917386390968, "input_throughput": 941.2783981026236, "output_throughput": 94.70911137056987, "total_throughput": 1035.9875094731933, "mean_e2e_latency_ms": 4454.932378418744, "median_e2e_latency_ms": 3474.5054844534025, "std_e2e_latency_ms": 3141.729941933886, "p90_e2e_latency_ms": 9705.666761891916, "p99_e2e_latency_ms": 10252.758897030726, "mean_ttft_ms": 167.52417001407593, "median_ttft_ms": 153.05893344338983, "std_ttft_ms": 96.79885770899398, "p99_ttft_ms": 366.1548413033597, "mean_tpot_ms": 9.747877234945388, "median_tpot_ms": 10.067643957439982, "std_tpot_ms": 2.2756355104449306, "p99_tpot_ms": 13.484334798556336, "mean_itl_ms": 10.186269659067515, "median_itl_ms": 10.326585033908486, "std_itl_ms": 2.367790916680719, "p95_itl_ms": 14.481087820604444, "p99_itl_ms": 14.626262942329049, "concurrency": 0.9998167933081008, "accept_length": null, "max_output_tokens_per_s": 148.0, "max_concurrent_requests": 2} diff --git a/last_bench/sglang-oai_0123_320_1000_1000.jsonl b/last_bench/sglang-oai_0123_320_1000_1000.jsonl new file mode 100644 index 000000000..1881fdfce --- /dev/null +++ b/last_bench/sglang-oai_0123_320_1000_1000.jsonl @@ -0,0 +1,7 @@ +{"tag": "triton", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 64, "sharegpt_output_len": null, "random_input_len": 1000, "random_output_len": 1000, "random_range_ratio": 0.0, "server_info": null, "duration": 88.22740700491704, "completed": 320, "total_input_tokens": 158939, "total_input_text_tokens": 158939, "total_input_vision_tokens": 0, "total_output_tokens": 170134, "total_output_tokens_retokenized": 169083, 
"request_throughput": 3.626990873506754, "input_throughput": 1801.4696951384065, "output_throughput": 1928.3577039787442, "total_throughput": 3729.8273991171504, "mean_e2e_latency_ms": 15789.648024459893, "median_e2e_latency_ms": 16212.67285104841, "std_e2e_latency_ms": 8072.337017063825, "p90_e2e_latency_ms": 26709.808869380508, "p99_e2e_latency_ms": 31045.851741307415, "mean_ttft_ms": 144.5240720640868, "median_ttft_ms": 109.16357941459864, "std_ttft_ms": 61.89498059484625, "p99_ttft_ms": 260.22138732951134, "mean_tpot_ms": 29.608090827655733, "median_tpot_ms": 30.36084325541004, "std_tpot_ms": 2.7705613573890226, "p99_tpot_ms": 33.954402691723445, "mean_itl_ms": 29.485199626481318, "median_itl_ms": 24.739603977650404, "std_itl_ms": 17.225318489747373, "p95_itl_ms": 73.9279053057544, "p99_itl_ms": 98.22126995772122, "concurrency": 57.268909280599985, "accept_length": null, "max_output_tokens_per_s": 2495.0, "max_concurrent_requests": 71} +{"tag": "fa3", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 64, "sharegpt_output_len": null, "random_input_len": 1000, "random_output_len": 1000, "random_range_ratio": 0.0, "server_info": null, "duration": 81.37246890296228, "completed": 320, "total_input_tokens": 158939, "total_input_text_tokens": 158939, "total_input_vision_tokens": 0, "total_output_tokens": 170134, "total_output_tokens_retokenized": 169225, "request_throughput": 3.9325339923212126, "input_throughput": 1953.2281881423164, "output_throughput": 2090.805432029929, "total_throughput": 4044.0336201722453, "mean_e2e_latency_ms": 14576.79473591852, "median_e2e_latency_ms": 14943.263828055933, "std_e2e_latency_ms": 7403.160837428686, "p90_e2e_latency_ms": 24588.386960700154, "p99_e2e_latency_ms": 28584.442388296593, "mean_ttft_ms": 189.66343759384472, "median_ttft_ms": 111.13140103407204, "std_ttft_ms": 154.90940140833771, "p99_ttft_ms": 498.54426054283977, "mean_tpot_ms": 27.2643124012368, "median_tpot_ms": 27.90378232804311, "std_tpot_ms": 2.606168417806973, "p99_tpot_ms": 31.501365364149077, "mean_itl_ms": 27.111262237968155, "median_itl_ms": 22.098568035289645, "std_itl_ms": 17.53854613363614, "p95_itl_ms": 73.02747792564332, "p99_itl_ms": 94.83545971103013, "concurrency": 57.323740798088494, "accept_length": null, "max_output_tokens_per_s": 2815.0, "max_concurrent_requests": 72} +{"tag": "flashmla", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 64, "sharegpt_output_len": null, "random_input_len": 1000, "random_output_len": 1000, "random_range_ratio": 0.0, "server_info": null, "duration": 85.05533684790134, "completed": 320, "total_input_tokens": 158939, "total_input_text_tokens": 158939, "total_input_vision_tokens": 0, "total_output_tokens": 170134, "total_output_tokens_retokenized": 169005, "request_throughput": 3.7622565715333556, "input_throughput": 1868.6540538216875, "output_throughput": 2000.2742485664248, "total_throughput": 3868.928302388112, "mean_e2e_latency_ms": 15298.422675980692, "median_e2e_latency_ms": 15641.821417491883, "std_e2e_latency_ms": 7749.526213849034, "p90_e2e_latency_ms": 25955.183803243566, "p99_e2e_latency_ms": 30152.963120699864, "mean_ttft_ms": 269.53684461695957, "median_ttft_ms": 118.97248448804021, "std_ttft_ms": 275.2191731553012, "p99_ttft_ms": 867.960302028805, "mean_tpot_ms": 28.51670166651952, "median_tpot_ms": 29.272230424085432, "std_tpot_ms": 3.0552503112772613, "p99_tpot_ms": 33.589946233320624, "mean_itl_ms": 28.321577751000884, "median_itl_ms": 
21.607821458019316, "std_itl_ms": 22.36865151668966, "p95_itl_ms": 95.35049657570198, "p99_itl_ms": 99.12627678597336, "concurrency": 57.55659124680326, "accept_length": null, "max_output_tokens_per_s": 2815.0, "max_concurrent_requests": 71} +{"tag": "flashinfer", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 64, "sharegpt_output_len": null, "random_input_len": 1000, "random_output_len": 1000, "random_range_ratio": 0.0, "server_info": null, "duration": 76.08857471891679, "completed": 320, "total_input_tokens": 158939, "total_input_text_tokens": 158939, "total_input_vision_tokens": 0, "total_output_tokens": 170134, "total_output_tokens_retokenized": 154408, "request_throughput": 4.205624841602442, "input_throughput": 2088.868145935783, "output_throughput": 2235.9993025037184, "total_throughput": 4324.8674484395015, "mean_e2e_latency_ms": 13736.860047160008, "median_e2e_latency_ms": 14124.713889556006, "std_e2e_latency_ms": 7050.329882103861, "p90_e2e_latency_ms": 23447.995013161566, "p99_e2e_latency_ms": 27480.407421826385, "mean_ttft_ms": 162.28689183481038, "median_ttft_ms": 113.47088008187711, "std_ttft_ms": 78.54445987281336, "p99_ttft_ms": 314.5761683699675, "mean_tpot_ms": 25.760918661083565, "median_tpot_ms": 26.645654844490704, "std_tpot_ms": 3.0593642182725933, "p99_tpot_ms": 30.497802868581097, "mean_itl_ms": 25.969708663598038, "median_itl_ms": 19.10600601695478, "std_itl_ms": 49.776817453535145, "p95_itl_ms": 91.81465758010745, "p99_itl_ms": 96.63003749214113, "concurrency": 57.77207985995222, "accept_length": null, "max_output_tokens_per_s": 3077.0, "max_concurrent_requests": 71} +{"tag": "lightllm-fa3", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 64, "sharegpt_output_len": null, "random_input_len": 1000, "random_output_len": 1000, "random_range_ratio": 0.0, "server_info": null, "duration": 63.755531140137464, "completed": 320, "total_input_tokens": 158939, "total_input_text_tokens": 158939, "total_input_vision_tokens": 0, "total_output_tokens": 170134, "total_output_tokens_retokenized": 169953, "request_throughput": 5.019172364772962, "input_throughput": 2492.944489014531, "output_throughput": 2668.537097213385, "total_throughput": 5161.481586227916, "mean_e2e_latency_ms": 11383.221806850634, "median_e2e_latency_ms": 11569.44180233404, "std_e2e_latency_ms": 5830.871063837923, "p90_e2e_latency_ms": 18649.839925765995, "p99_e2e_latency_ms": 23242.17696905136, "mean_ttft_ms": 158.7225136347115, "median_ttft_ms": 99.41273042932153, "std_ttft_ms": 128.79054902232136, "p99_ttft_ms": 430.53666956257075, "mean_tpot_ms": 21.295283829261166, "median_tpot_ms": 20.846842278524452, "std_tpot_ms": 2.433949297342979, "p99_tpot_ms": 26.92992232441266, "mean_itl_ms": 21.162570676283387, "median_itl_ms": 19.59298853762448, "std_itl_ms": 11.532776164590596, "p95_itl_ms": 21.707738342229277, "p99_itl_ms": 80.35093266516924, "concurrency": 57.13435231502565, "accept_length": null, "max_output_tokens_per_s": 3383.0, "max_concurrent_requests": 73} +{"tag": "lightllm-flashinfer", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 64, "sharegpt_output_len": null, "random_input_len": 1000, "random_output_len": 1000, "random_range_ratio": 0.0, "server_info": null, "duration": 61.45518663688563, "completed": 320, "total_input_tokens": 158939, "total_input_text_tokens": 158939, "total_input_vision_tokens": 0, "total_output_tokens": 170134, 
"total_output_tokens_retokenized": 170000, "request_throughput": 5.207046264308875, "input_throughput": 2586.2585193843383, "output_throughput": 2768.4237785372693, "total_throughput": 5354.682297921608, "mean_e2e_latency_ms": 10922.24139619575, "median_e2e_latency_ms": 11354.776699910872, "std_e2e_latency_ms": 5564.939199084309, "p90_e2e_latency_ms": 18001.203981018625, "p99_e2e_latency_ms": 21854.658021384384, "mean_ttft_ms": 137.85723637993215, "median_ttft_ms": 92.63190545607358, "std_ttft_ms": 111.70441609815067, "p99_ttft_ms": 531.437413871754, "mean_tpot_ms": 20.47186250275375, "median_tpot_ms": 19.981252363955022, "std_tpot_ms": 2.0238247748229328, "p99_tpot_ms": 26.164035337201277, "mean_itl_ms": 20.334099341341997, "median_itl_ms": 19.39774421043694, "std_itl_ms": 12.074799257794227, "p95_itl_ms": 21.160434209741645, "p99_itl_ms": 94.60746685508639, "concurrency": 56.87261625994083, "accept_length": null, "max_output_tokens_per_s": 3439.0, "max_concurrent_requests": 76} +{"tag": "lightllm-triton", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 64, "sharegpt_output_len": null, "random_input_len": 1000, "random_output_len": 1000, "random_range_ratio": 0.0, "server_info": null, "duration": 72.93456637696363, "completed": 320, "total_input_tokens": 158939, "total_input_text_tokens": 158939, "total_input_vision_tokens": 0, "total_output_tokens": 170134, "total_output_tokens_retokenized": 169880, "request_throughput": 4.387494378811745, "input_throughput": 2179.1999033561247, "output_throughput": 2332.693652014867, "total_throughput": 4511.893555370992, "mean_e2e_latency_ms": 12913.627206679666, "median_e2e_latency_ms": 13505.511358496733, "std_e2e_latency_ms": 6557.100403482816, "p90_e2e_latency_ms": 21201.299887429923, "p99_e2e_latency_ms": 25048.928881627508, "mean_ttft_ms": 123.85591540514724, "median_ttft_ms": 92.86466846242547, "std_ttft_ms": 61.161896228451845, "p99_ttft_ms": 276.2570247449912, "mean_tpot_ms": 24.217108368398687, "median_tpot_ms": 23.87942465522868, "std_tpot_ms": 1.7713138005549998, "p99_tpot_ms": 28.78332211641795, "mean_itl_ms": 24.109886530009778, "median_itl_ms": 23.613386088982224, "std_itl_ms": 10.00878708819206, "p95_itl_ms": 24.657311802729964, "p99_itl_ms": 80.39279339835048, "concurrency": 56.65846677937745, "accept_length": null, "max_output_tokens_per_s": 2932.0, "max_concurrent_requests": 74} diff --git a/last_bench/sglang-oai_0123_320_1000_8000.jsonl b/last_bench/sglang-oai_0123_320_1000_8000.jsonl new file mode 100644 index 000000000..15f9e8e1c --- /dev/null +++ b/last_bench/sglang-oai_0123_320_1000_8000.jsonl @@ -0,0 +1,7 @@ +{"tag": "triton", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 64, "sharegpt_output_len": null, "random_input_len": 1000, "random_output_len": 8000, "random_range_ratio": 0.0, "server_info": {"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30000, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, 
"kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 1, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": null, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 604505033, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "triton", "decode_attention_backend": null, "prefill_attention_backend": null, "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": 
"flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, 
"enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": false, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "status": "ready", "max_total_num_tokens": 1386224, "max_req_input_len": 202746, "internal_states": [{"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", 
"tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30000, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 1, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": 3500, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 604505033, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": 
null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "triton", "decode_attention_backend": "triton", "prefill_attention_backend": "triton", "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 
128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": true, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, 
"pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "use_mla_backend": true, "last_gen_throughput": 60.77974941333156, "memory_usage": {"weight": 56.33, "kvcache": 69.9, "token_capacity": 1386224, "graph": 1.69}, "effective_max_running_requests_per_dp": 3500}], "version": "0.0.0.dev1+g1e8e0cca2"}, "duration": 795.775275953114, "completed": 320, "total_input_tokens": 158939, "total_input_text_tokens": 158939, "total_input_vision_tokens": 0, "total_output_tokens": 1301025, "total_output_tokens_retokenized": 1290119, "request_throughput": 0.40212357642894897, "input_throughput": 199.72849723137725, "output_throughput": 1634.9150813233541, "total_throughput": 1834.6435785547314, "mean_e2e_latency_ms": 139349.8814202183, "median_e2e_latency_ms": 143803.1537218485, "std_e2e_latency_ms": 78727.69232926994, "p90_e2e_latency_ms": 247236.3009268418, "p99_e2e_latency_ms": 278858.07280272944, "mean_ttft_ms": 331.6782287663955, "median_ttft_ms": 114.08616451080889, "std_ttft_ms": 449.4168910393588, "p99_ttft_ms": 1511.0559103870764, "mean_tpot_ms": 34.17474498115479, "median_tpot_ms": 35.2410475091058, "std_tpot_ms": 2.6142427195879208, "p99_tpot_ms": 37.41304436548783, "mean_itl_ms": 34.197111507755174, "median_itl_ms": 34.45811802521348, "std_itl_ms": 7.2511462249701575, "p95_itl_ms": 37.52578243147582, "p99_itl_ms": 75.70609046146272, "concurrency": 56.03587269164813, "accept_length": null, "max_output_tokens_per_s": 2752.0, "max_concurrent_requests": 68} +{"tag": "fa3", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 64, "sharegpt_output_len": null, "random_input_len": 1000, "random_output_len": 8000, "random_range_ratio": 0.0, "server_info": {"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30001, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 1, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, 
"prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": null, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 398782077, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "fa3", "decode_attention_backend": null, "prefill_attention_backend": null, "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", 
"speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, 
"triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": false, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "status": "ready", "max_total_num_tokens": 1386225, "max_req_input_len": 202746, "internal_states": [{"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30001, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": 
null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 1, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": 3500, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 398782077, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "fa3", "decode_attention_backend": "fa3", "prefill_attention_backend": "fa3", "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, 
"speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, 
"enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": false, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "use_mla_backend": true, "last_gen_throughput": 111.68147398118492, "memory_usage": {"weight": 56.33, "kvcache": 69.9, "token_capacity": 1386225, "graph": 2.36}, "effective_max_running_requests_per_dp": 3500}], "version": "0.0.0.dev1+g1e8e0cca2"}, "duration": 563.1268216208555, "completed": 320, "total_input_tokens": 158939, "total_input_text_tokens": 158939, "total_input_vision_tokens": 0, "total_output_tokens": 1301025, "total_output_tokens_retokenized": 1289330, "request_throughput": 0.5682556534582027, 
"input_throughput": 282.24370407810403, "output_throughput": 2310.358786063932, "total_throughput": 2592.602490142036, "mean_e2e_latency_ms": 99475.42591601159, "median_e2e_latency_ms": 102824.72171052359, "std_e2e_latency_ms": 55369.15182006764, "p90_e2e_latency_ms": 176665.20955280866, "p99_e2e_latency_ms": 194569.74888183875, "mean_ttft_ms": 284.5753706082178, "median_ttft_ms": 99.63217296171933, "std_ttft_ms": 370.4153459018554, "p99_ttft_ms": 1195.5236710840836, "mean_tpot_ms": 24.49880321564915, "median_tpot_ms": 24.85869417485708, "std_tpot_ms": 1.065935851021897, "p99_tpot_ms": 25.785650772846513, "mean_itl_ms": 24.39991724061233, "median_itl_ms": 24.06020206399262, "std_itl_ms": 6.6672981127611495, "p95_itl_ms": 24.959364032838494, "p99_itl_ms": 73.22543800109997, "concurrency": 56.5274731569362, "accept_length": null, "max_output_tokens_per_s": 2944.0, "max_concurrent_requests": 68} +{"tag": "flashmla", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 64, "sharegpt_output_len": null, "random_input_len": 1000, "random_output_len": 8000, "random_range_ratio": 0.0, "server_info": {"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30002, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 64, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": null, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 379113072, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, 
"show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "flashmla", "decode_attention_backend": null, "prefill_attention_backend": null, "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", 
"mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": false, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, 
"enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "status": "ready", "max_total_num_tokens": 1386176, "max_req_input_len": 202746, "internal_states": [{"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30002, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 64, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": 3500, "pp_async_batch_depth": 0, 
"stream_interval": 1, "stream_output": false, "random_seed": 379113072, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "flashmla", "decode_attention_backend": "flashmla", "prefill_attention_backend": "flashmla", "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, 
"deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, 
"enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": false, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "use_mla_backend": true, "last_gen_throughput": 98.18409159571067, "memory_usage": {"weight": 56.33, "kvcache": 69.9, "token_capacity": 1386176, "graph": 1.21}, "effective_max_running_requests_per_dp": 3500}], "version": "0.0.0.dev1+g1e8e0cca2"}, "duration": 549.4739509860519, "completed": 320, "total_input_tokens": 158939, "total_input_text_tokens": 158939, "total_input_vision_tokens": 0, "total_output_tokens": 1301025, "total_output_tokens_retokenized": 1288321, "request_throughput": 0.582375196177631, "input_throughput": 289.256660328989, "output_throughput": 2367.7646550218824, "total_throughput": 2657.0213153508716, "mean_e2e_latency_ms": 96338.26554486295, "median_e2e_latency_ms": 99560.23909756914, "std_e2e_latency_ms": 53588.08448171708, "p90_e2e_latency_ms": 171176.718534017, "p99_e2e_latency_ms": 188599.8856568616, "mean_ttft_ms": 347.4133831434301, "median_ttft_ms": 120.82068901509047, "std_ttft_ms": 461.0518339640887, "p99_ttft_ms": 1505.2862889668907, "mean_tpot_ms": 23.73714903655295, "median_tpot_ms": 24.09194966855164, "std_tpot_ms": 1.1065674013938183, "p99_tpot_ms": 25.431691372342634, "mean_itl_ms": 23.620946834591376, "median_itl_ms": 23.033733014017344, "std_itl_ms": 8.992960795726017, "p95_itl_ms": 23.83332545869052, "p99_itl_ms": 96.43437403254211, "concurrency": 
56.10501629610227, "accept_length": null, "max_output_tokens_per_s": 3008.0, "max_concurrent_requests": 68} +{"tag": "flashinfer", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 64, "sharegpt_output_len": null, "random_input_len": 1000, "random_output_len": 8000, "random_range_ratio": 0.0, "server_info": {"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30003, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 1, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": null, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 253908945, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", 
"chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "flashinfer", "decode_attention_backend": null, "prefill_attention_backend": null, "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": 
null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": false, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": 
"zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "status": "ready", "max_total_num_tokens": 1386222, "max_req_input_len": 202746, "internal_states": [{"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30003, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 1, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": 3500, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 253908945, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, 
"bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "flashinfer", "decode_attention_backend": "flashinfer", "prefill_attention_backend": "flashinfer", "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": 
"kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": false, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, 
"debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "use_mla_backend": true, "last_gen_throughput": 114.74399660365336, "memory_usage": {"weight": 56.33, "kvcache": 69.9, "token_capacity": 1386222, "graph": 1.71}, "effective_max_running_requests_per_dp": 3500}], "version": "0.0.0.dev1+g1e8e0cca2"}, "duration": 497.495255718939, "completed": 320, "total_input_tokens": 158939, "total_input_text_tokens": 158939, "total_input_vision_tokens": 0, "total_output_tokens": 1301025, "total_output_tokens_retokenized": 1171209, "request_throughput": 0.6432222143255667, "input_throughput": 319.4784235084101, "output_throughput": 2615.150566852876, "total_throughput": 2934.6289903612865, "mean_e2e_latency_ms": 87646.07776943667, "median_e2e_latency_ms": 90406.89503098838, "std_e2e_latency_ms": 48906.1312695597, "p90_e2e_latency_ms": 156414.02036293878, "p99_e2e_latency_ms": 172093.0460711522, "mean_ttft_ms": 289.28253828853485, "median_ttft_ms": 115.20864104386419, "std_ttft_ms": 356.69460164728656, "p99_ttft_ms": 1172.9512733081356, "mean_tpot_ms": 21.568914790170023, "median_tpot_ms": 21.93780025128241, "std_tpot_ms": 1.0788863103757071, "p99_tpot_ms": 22.959191481056195, "mean_itl_ms": 21.979791418337847, "median_itl_ms": 20.824731094762683, "std_itl_ms": 139.14425778232058, "p95_itl_ms": 22.08241473417729, "p99_itl_ms": 91.44890741910787, "concurrency": 56.37590421980788, "accept_length": null, "max_output_tokens_per_s": 3336.0, "max_concurrent_requests": 68} +{"tag": "lightllm-fa3", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 64, "sharegpt_output_len": null, "random_input_len": 1000, "random_output_len": 8000, "random_range_ratio": 0.0, "server_info": null, "duration": 489.09222918911837, "completed": 320, "total_input_tokens": 158939, "total_input_text_tokens": 158939, "total_input_vision_tokens": 0, "total_output_tokens": 1301025, "total_output_tokens_retokenized": 1300503, "request_throughput": 0.654273326997115, "input_throughput": 324.9673384987327, "output_throughput": 2660.0811101763175, "total_throughput": 2985.0484486750506, "mean_e2e_latency_ms": 87208.0360039814, "median_e2e_latency_ms": 90027.0975221647, "std_e2e_latency_ms": 
48689.53844942357, "p90_e2e_latency_ms": 155637.0125748683, "p99_e2e_latency_ms": 171588.75004097354, "mean_ttft_ms": 170.48821544085513, "median_ttft_ms": 121.7766614863649, "std_ttft_ms": 105.2517967311711, "p99_ttft_ms": 400.0959781301208, "mean_tpot_ms": 21.505252597332344, "median_tpot_ms": 21.96794476552389, "std_tpot_ms": 1.1776802400447766, "p99_tpot_ms": 23.08266933652801, "mean_itl_ms": 21.418628915344538, "median_itl_ms": 21.57716895453632, "std_itl_ms": 6.864509437259774, "p95_itl_ms": 22.48724412638694, "p99_itl_ms": 76.17134404601528, "concurrency": 57.05789185720911, "accept_length": null, "max_output_tokens_per_s": 3390.0, "max_concurrent_requests": 68} +{"tag": "lightllm-flashinfer", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 64, "sharegpt_output_len": null, "random_input_len": 1000, "random_output_len": 8000, "random_range_ratio": 0.0, "server_info": null, "duration": 484.7376974809449, "completed": 320, "total_input_tokens": 158939, "total_input_text_tokens": 158939, "total_input_vision_tokens": 0, "total_output_tokens": 1301025, "total_output_tokens_retokenized": 1300473, "request_throughput": 0.6601508437717065, "input_throughput": 327.8866092444727, "output_throughput": 2683.9773484940138, "total_throughput": 3011.8639577384865, "mean_e2e_latency_ms": 86270.7500121207, "median_e2e_latency_ms": 89360.22441205569, "std_e2e_latency_ms": 48155.0270541477, "p90_e2e_latency_ms": 154581.4763627248, "p99_e2e_latency_ms": 169827.765711823, "mean_ttft_ms": 213.335813806043, "median_ttft_ms": 139.8405219661072, "std_ttft_ms": 155.78232331585778, "p99_ttft_ms": 557.104211600963, "mean_tpot_ms": 21.26728059280753, "median_tpot_ms": 21.71698714107755, "std_tpot_ms": 1.217376459708817, "p99_tpot_ms": 23.195757839312797, "mean_itl_ms": 21.17951751529825, "median_itl_ms": 19.90097260568291, "std_itl_ms": 8.868560849765684, "p95_itl_ms": 22.173493867740035, "p99_itl_ms": 94.8113461374305, "concurrency": 56.95170841331944, "accept_length": null, "max_output_tokens_per_s": 3455.0, "max_concurrent_requests": 68} +{"tag": "lightllm-triton", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 64, "sharegpt_output_len": null, "random_input_len": 1000, "random_output_len": 8000, "random_range_ratio": 0.0, "server_info": null, "duration": 760.4980736661237, "completed": 320, "total_input_tokens": 158939, "total_input_text_tokens": 158939, "total_input_vision_tokens": 0, "total_output_tokens": 1301025, "total_output_tokens_retokenized": 1300506, "request_throughput": 0.42077687121202023, "input_throughput": 208.99329729239776, "output_throughput": 1710.7538402144332, "total_throughput": 1919.747137506831, "mean_e2e_latency_ms": 133178.9838919889, "median_e2e_latency_ms": 137120.54603244178, "std_e2e_latency_ms": 75561.65705893908, "p90_e2e_latency_ms": 238436.7804493755, "p99_e2e_latency_ms": 266451.137662374, "mean_ttft_ms": 221.37831723230192, "median_ttft_ms": 122.04821163322777, "std_ttft_ms": 194.03500611102027, "p99_ttft_ms": 679.4678735616617, "mean_tpot_ms": 32.663685042973555, "median_tpot_ms": 33.72062742356595, "std_tpot_ms": 2.9253090523981875, "p99_tpot_ms": 36.29193072595595, "mean_itl_ms": 32.71866235950269, "median_itl_ms": 32.39020751789212, "std_itl_ms": 6.883452147345264, "p95_itl_ms": 36.488556885160506, "p99_itl_ms": 78.23992114746945, "concurrency": 56.038636153267134, "accept_length": null, "max_output_tokens_per_s": 2941.0, "max_concurrent_requests": 67} diff --git 
a/last_bench/sglang-oai_0123_320_8000_1000.jsonl b/last_bench/sglang-oai_0123_320_8000_1000.jsonl new file mode 100644 index 000000000..58c0b7aab --- /dev/null +++ b/last_bench/sglang-oai_0123_320_8000_1000.jsonl @@ -0,0 +1,7 @@ +{"tag": "triton", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 64, "sharegpt_output_len": null, "random_input_len": 8000, "random_output_len": 1000, "random_range_ratio": 0.0, "server_info": {"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30000, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 1, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": null, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 604505033, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, 
"api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "triton", "decode_attention_backend": null, "prefill_attention_backend": null, "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, 
"dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": false, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, 
"disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "status": "ready", "max_total_num_tokens": 1386224, "max_req_input_len": 202746, "internal_states": [{"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30000, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 1, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": 3500, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 604505033, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, 
"tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "triton", "decode_attention_backend": "triton", "prefill_attention_backend": "triton", "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, 
"enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": true, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, 
"dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "use_mla_backend": true, "last_gen_throughput": 179.99204346640389, "memory_usage": {"weight": 56.33, "kvcache": 69.9, "token_capacity": 1386224, "graph": 1.69}, "effective_max_running_requests_per_dp": 3500}], "version": "0.0.0.dev1+g1e8e0cca2"}, "duration": 213.6918999250047, "completed": 320, "total_input_tokens": 1273893, "total_input_text_tokens": 1273893, "total_input_vision_tokens": 0, "total_output_tokens": 170000, "total_output_tokens_retokenized": 169253, "request_throughput": 1.4974830590785342, "input_throughput": 5961.353708058535, "output_throughput": 795.5378751354713, "total_throughput": 6756.891583194006, "mean_e2e_latency_ms": 40317.63803735594, "median_e2e_latency_ms": 38796.292787534185, "std_e2e_latency_ms": 20574.065110792944, "p90_e2e_latency_ms": 68181.18443279527, "p99_e2e_latency_ms": 86222.15473645601, "mean_ttft_ms": 3873.7211492632923, "median_ttft_ms": 419.3346749525517, "std_ttft_ms": 7957.863948654817, "p99_ttft_ms": 30295.651319040917, "mean_tpot_ms": 73.12630469464084, "median_tpot_ms": 72.57030575516674, "std_tpot_ms": 30.291116863543795, "p99_tpot_ms": 200.25694315823512, "mean_itl_ms": 68.72528059291209, "median_itl_ms": 40.35702208057046, "std_itl_ms": 317.60260508162276, "p95_itl_ms": 215.18637766130269, "p99_itl_ms": 464.46666873060167, "concurrency": 60.37497994300084, "accept_length": null, "max_output_tokens_per_s": 1664.0, "max_concurrent_requests": 68} +{"tag": "fa3", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 64, "sharegpt_output_len": null, "random_input_len": 8000, "random_output_len": 1000, "random_range_ratio": 0.0, "server_info": {"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": 
"127.0.0.1", "port": 30001, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 1, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": null, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 398782077, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "fa3", 
"decode_attention_backend": null, "prefill_attention_backend": null, "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, 
"enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": false, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, 
"decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "status": "ready", "max_total_num_tokens": 1386225, "max_req_input_len": 202746, "internal_states": [{"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30001, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 1, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": 3500, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 398782077, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, 
"reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "fa3", "decode_attention_backend": "fa3", "prefill_attention_backend": "fa3", "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, 
"offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": false, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": 
null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "use_mla_backend": true, "last_gen_throughput": 260.871967377065, "memory_usage": {"weight": 56.33, "kvcache": 69.9, "token_capacity": 1386225, "graph": 2.36}, "effective_max_running_requests_per_dp": 3500}], "version": "0.0.0.dev1+g1e8e0cca2"}, "duration": 116.99378763791174, "completed": 320, "total_input_tokens": 1273893, "total_input_text_tokens": 1273893, "total_input_vision_tokens": 0, "total_output_tokens": 170000, "total_output_tokens_retokenized": 169199, "request_throughput": 2.735187965624119, "input_throughput": 10888.552509665018, "output_throughput": 1453.0686067378133, "total_throughput": 12341.621116402832, "mean_e2e_latency_ms": 21713.53677811203, "median_e2e_latency_ms": 20956.929823034443, "std_e2e_latency_ms": 10695.828486107339, "p90_e2e_latency_ms": 36470.67303787918, "p99_e2e_latency_ms": 42708.998922796454, "mean_ttft_ms": 1138.065914130857, "median_ttft_ms": 244.04411506839097, "std_ttft_ms": 2117.0216766259114, "p99_ttft_ms": 8315.621516695246, "mean_tpot_ms": 40.13203695065518, "median_tpot_ms": 40.90892274458055, "std_tpot_ms": 9.072889948810449, "p99_tpot_ms": 70.9889798734412, "mean_itl_ms": 38.8005202036452, "median_itl_ms": 25.11462802067399, "std_itl_ms": 92.39345109467033, "p95_itl_ms": 121.36952648870647, "p99_itl_ms": 196.49121300317347, "concurrency": 59.390604486628725, "accept_length": null, "max_output_tokens_per_s": 2419.0, "max_concurrent_requests": 69} +{"tag": "flashmla", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 64, "sharegpt_output_len": null, "random_input_len": 8000, "random_output_len": 1000, "random_range_ratio": 0.0, "server_info": {"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30002, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, 
"schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 64, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": null, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 379113072, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "flashmla", "decode_attention_backend": null, "prefill_attention_backend": null, "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", 
"speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 
320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": false, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "status": "ready", "max_total_num_tokens": 1386176, "max_req_input_len": 202746, "internal_states": [{"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30002, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", 
"enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 64, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": 3500, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 379113072, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "flashmla", "decode_attention_backend": "flashmla", "prefill_attention_backend": "flashmla", "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", 
"nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, 
"enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": false, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "use_mla_backend": true, "last_gen_throughput": 233.5999098891422, "memory_usage": {"weight": 56.33, "kvcache": 69.9, "token_capacity": 1386176, "graph": 1.21}, 
"effective_max_running_requests_per_dp": 3500}], "version": "0.0.0.dev1+g1e8e0cca2"}, "duration": 112.81122839893214, "completed": 320, "total_input_tokens": 1273893, "total_input_text_tokens": 1273893, "total_input_vision_tokens": 0, "total_output_tokens": 170000, "total_output_tokens_retokenized": 169153, "request_throughput": 2.8365970705361905, "input_throughput": 11292.253599926747, "output_throughput": 1506.942193722351, "total_throughput": 12799.195793649098, "mean_e2e_latency_ms": 20836.65436528754, "median_e2e_latency_ms": 20054.032715968788, "std_e2e_latency_ms": 10206.070387118045, "p90_e2e_latency_ms": 34869.302074844025, "p99_e2e_latency_ms": 41031.26083256677, "mean_ttft_ms": 1201.1233022982196, "median_ttft_ms": 222.6223434554413, "std_ttft_ms": 2285.9580876221967, "p99_ttft_ms": 8968.076999972109, "mean_tpot_ms": 38.38958789982542, "median_tpot_ms": 38.97802759513901, "std_tpot_ms": 9.349859411662857, "p99_tpot_ms": 72.9707564608667, "mean_itl_ms": 37.02905762269816, "median_itl_ms": 23.81886588409543, "std_itl_ms": 96.84834802975762, "p95_itl_ms": 109.55824161646888, "p99_itl_ms": 181.73167313216254, "concurrency": 59.105192732349764, "accept_length": null, "max_output_tokens_per_s": 2627.0, "max_concurrent_requests": 71} +{"tag": "flashinfer", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 64, "sharegpt_output_len": null, "random_input_len": 8000, "random_output_len": 1000, "random_range_ratio": 0.0, "server_info": {"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30003, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 1, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": null, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 253908945, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, 
"download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "flashinfer", "decode_attention_backend": null, "prefill_attention_backend": null, "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, 
"eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": false, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, 
"enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "status": "ready", "max_total_num_tokens": 1386222, "max_req_input_len": 202746, "internal_states": [{"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30003, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 1, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", 
"enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": 3500, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 253908945, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "flashinfer", "decode_attention_backend": "flashinfer", "prefill_attention_backend": "flashinfer", "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, 
"speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, 
"triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": false, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "use_mla_backend": true, "last_gen_throughput": 274.27057526732784, "memory_usage": {"weight": 56.33, "kvcache": 69.9, "token_capacity": 1386222, "graph": 1.71}, "effective_max_running_requests_per_dp": 3500}], "version": "0.0.0.dev1+g1e8e0cca2"}, "duration": 112.2128552980721, "completed": 320, "total_input_tokens": 1273893, "total_input_text_tokens": 1273893, "total_input_vision_tokens": 0, "total_output_tokens": 170000, "total_output_tokens_retokenized": 156092, "request_throughput": 2.8517231751208976, "input_throughput": 11352.469346013393, "output_throughput": 1514.977936782977, "total_throughput": 12867.44728279637, "mean_e2e_latency_ms": 20905.081391092244, "median_e2e_latency_ms": 20195.792312850244, "std_e2e_latency_ms": 10355.055639883163, "p90_e2e_latency_ms": 35295.89922316374, "p99_e2e_latency_ms": 41340.240367348306, "mean_ttft_ms": 1119.6136578902951, "median_ttft_ms": 241.6940564289689, "std_ttft_ms": 2078.1090631361863, "p99_ttft_ms": 8247.429187672678, 
"mean_tpot_ms": 38.66073936228574, "median_tpot_ms": 39.597893979087445, "std_tpot_ms": 9.283534476702123, "p99_tpot_ms": 70.24860901496342, "mean_itl_ms": 37.540270646483414, "median_itl_ms": 23.099064477719367, "std_itl_ms": 111.93775803445361, "p95_itl_ms": 118.23046104982495, "p99_itl_ms": 195.68518341518939, "concurrency": 59.61550508076637, "accept_length": null, "max_output_tokens_per_s": 2595.0, "max_concurrent_requests": 69} +{"tag": "lightllm-fa3", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 64, "sharegpt_output_len": null, "random_input_len": 8000, "random_output_len": 1000, "random_range_ratio": 0.0, "server_info": null, "duration": 85.61151024489664, "completed": 320, "total_input_tokens": 1273893, "total_input_text_tokens": 1273893, "total_input_vision_tokens": 0, "total_output_tokens": 170000, "total_output_tokens_retokenized": 169953, "request_throughput": 3.737815149909418, "input_throughput": 14879.926733636119, "output_throughput": 1985.7142983893784, "total_throughput": 16865.641032025498, "mean_e2e_latency_ms": 15726.555139116681, "median_e2e_latency_ms": 15938.618876039982, "std_e2e_latency_ms": 8319.184834233261, "p90_e2e_latency_ms": 26862.930231867365, "p99_e2e_latency_ms": 32213.170791727025, "mean_ttft_ms": 277.19335837537074, "median_ttft_ms": 209.07252049073577, "std_ttft_ms": 191.42876900594652, "p99_ttft_ms": 673.9466302050278, "mean_tpot_ms": 29.049475895770772, "median_tpot_ms": 31.37292958098005, "std_tpot_ms": 4.868888682964287, "p99_tpot_ms": 35.96265073154, "mean_itl_ms": 29.144233108824857, "median_itl_ms": 21.852319943718612, "std_itl_ms": 28.374227292396913, "p95_itl_ms": 81.86526254285126, "p99_itl_ms": 147.18675724463537, "concurrency": 58.782956054876145, "accept_length": null, "max_output_tokens_per_s": 2941.0, "max_concurrent_requests": 71} +{"tag": "lightllm-flashinfer", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 64, "sharegpt_output_len": null, "random_input_len": 8000, "random_output_len": 1000, "random_range_ratio": 0.0, "server_info": null, "duration": 86.55054738209583, "completed": 320, "total_input_tokens": 1273893, "total_input_text_tokens": 1273893, "total_input_vision_tokens": 0, "total_output_tokens": 170000, "total_output_tokens_retokenized": 169856, "request_throughput": 3.697261423284729, "input_throughput": 14718.485769663917, "output_throughput": 1964.1701311200122, "total_throughput": 16682.655900783928, "mean_e2e_latency_ms": 15904.437752439117, "median_e2e_latency_ms": 16162.462773616426, "std_e2e_latency_ms": 8427.974845182198, "p90_e2e_latency_ms": 27384.40113137476, "p99_e2e_latency_ms": 32415.15740617411, "mean_ttft_ms": 273.31787959701614, "median_ttft_ms": 202.60415493976325, "std_ttft_ms": 191.3256412390505, "p99_ttft_ms": 717.0455491333269, "mean_tpot_ms": 29.321787822406232, "median_tpot_ms": 31.71495196018244, "std_tpot_ms": 5.006722794831577, "p99_tpot_ms": 35.875299398168835, "mean_itl_ms": 29.487281758663784, "median_itl_ms": 21.87118213623762, "std_itl_ms": 26.885584181443708, "p95_itl_ms": 99.2627366213128, "p99_itl_ms": 134.71757725346833, "concurrency": 58.80286416112642, "accept_length": null, "max_output_tokens_per_s": 3059.0, "max_concurrent_requests": 71} +{"tag": "lightllm-triton", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 64, "sharegpt_output_len": null, "random_input_len": 8000, "random_output_len": 1000, "random_range_ratio": 0.0, "server_info": 
null, "duration": 135.46758720418438, "completed": 320, "total_input_tokens": 1273893, "total_input_text_tokens": 1273893, "total_input_vision_tokens": 0, "total_output_tokens": 170000, "total_output_tokens_retokenized": 169774, "request_throughput": 2.3621886726134567, "input_throughput": 9403.67379600492, "output_throughput": 1254.9127323258988, "total_throughput": 10658.586528330818, "mean_e2e_latency_ms": 24741.570841200155, "median_e2e_latency_ms": 25289.254195871763, "std_e2e_latency_ms": 13030.010406347368, "p90_e2e_latency_ms": 42505.82551364787, "p99_e2e_latency_ms": 50194.15757432813, "mean_ttft_ms": 313.1668246234767, "median_ttft_ms": 267.09420257247984, "std_ttft_ms": 190.20965705094397, "p99_ttft_ms": 717.6051298505627, "mean_tpot_ms": 46.051251688612794, "median_tpot_ms": 48.8459376461558, "std_tpot_ms": 6.861405373140807, "p99_tpot_ms": 54.66476245289881, "mean_itl_ms": 46.08479912195559, "median_itl_ms": 37.94613853096962, "std_itl_ms": 35.57256149381119, "p95_itl_ms": 113.83506678976119, "p99_itl_ms": 199.79378876043484, "concurrency": 58.4442583837464, "accept_length": null, "max_output_tokens_per_s": 1783.0, "max_concurrent_requests": 70} diff --git a/last_bench/sglang-oai_0123_80_1000_1000.jsonl b/last_bench/sglang-oai_0123_80_1000_1000.jsonl new file mode 100644 index 000000000..1063a8aa5 --- /dev/null +++ b/last_bench/sglang-oai_0123_80_1000_1000.jsonl @@ -0,0 +1,7 @@ +{"tag": "triton", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 16, "sharegpt_output_len": null, "random_input_len": 1000, "random_output_len": 1000, "random_range_ratio": 0.0, "server_info": null, "duration": 58.75812066299841, "completed": 80, "total_input_tokens": 39668, "total_input_text_tokens": 39668, "total_input_vision_tokens": 0, "total_output_tokens": 40805, "total_output_tokens_retokenized": 40774, "request_throughput": 1.3615139336881172, "input_throughput": 675.1066840192528, "output_throughput": 694.4572008017952, "total_throughput": 1369.563884821048, "mean_e2e_latency_ms": 10290.420460538007, "median_e2e_latency_ms": 10854.726839112118, "std_e2e_latency_ms": 5481.416227557806, "p90_e2e_latency_ms": 17087.210976914506, "p99_e2e_latency_ms": 20273.29670665319, "mean_ttft_ms": 392.1325499599334, "median_ttft_ms": 109.52406388241798, "std_ttft_ms": 567.0151769289985, "p99_ttft_ms": 1561.811125462409, "mean_tpot_ms": 19.978452692457562, "median_tpot_ms": 19.987008761459922, "std_tpot_ms": 3.9156782596281814, "p99_tpot_ms": 28.60846174028575, "mean_itl_ms": 19.448439157011858, "median_itl_ms": 18.109396449290216, "std_itl_ms": 11.359371226695139, "p95_itl_ms": 19.791446393355727, "p99_itl_ms": 90.75973493745548, "concurrency": 14.010550840531788, "accept_length": null, "max_output_tokens_per_s": 896.0, "max_concurrent_requests": 21} +{"tag": "fa3", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 16, "sharegpt_output_len": null, "random_input_len": 1000, "random_output_len": 1000, "random_range_ratio": 0.0, "server_info": null, "duration": 53.40713605610654, "completed": 80, "total_input_tokens": 39668, "total_input_text_tokens": 39668, "total_input_vision_tokens": 0, "total_output_tokens": 40805, "total_output_tokens_retokenized": 40773, "request_throughput": 1.4979271668107514, "input_throughput": 742.7471856631112, "output_throughput": 764.036475521409, "total_throughput": 1506.78366118452, "mean_e2e_latency_ms": 9317.695084243314, "median_e2e_latency_ms": 10052.179563441314, 
"std_e2e_latency_ms": 5047.758937948556, "p90_e2e_latency_ms": 15426.76037205384, "p99_e2e_latency_ms": 18604.811251361385, "mean_ttft_ms": 153.06907128542662, "median_ttft_ms": 107.1817415067926, "std_ttft_ms": 91.4799906514664, "p99_ttft_ms": 337.4836889700964, "mean_tpot_ms": 18.252970567128944, "median_tpot_ms": 18.4298531433687, "std_tpot_ms": 1.4893520951460373, "p99_tpot_ms": 21.899579410787737, "mean_itl_ms": 18.004708145534345, "median_itl_ms": 16.874348046258092, "std_itl_ms": 10.496979411646434, "p95_itl_ms": 17.423551063984632, "p99_itl_ms": 88.60507588833578, "concurrency": 13.957228598747054, "accept_length": null, "max_output_tokens_per_s": 976.0, "max_concurrent_requests": 21} +{"tag": "flashmla", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 16, "sharegpt_output_len": null, "random_input_len": 1000, "random_output_len": 1000, "random_range_ratio": 0.0, "server_info": null, "duration": 55.16790589294396, "completed": 80, "total_input_tokens": 39668, "total_input_text_tokens": 39668, "total_input_vision_tokens": 0, "total_output_tokens": 40805, "total_output_tokens_retokenized": 40786, "request_throughput": 1.4501184829317961, "input_throughput": 719.0412497617311, "output_throughput": 739.6510587003993, "total_throughput": 1458.6923084621305, "mean_e2e_latency_ms": 9602.716599009, "median_e2e_latency_ms": 10392.598490929231, "std_e2e_latency_ms": 5178.017009753717, "p90_e2e_latency_ms": 15825.043794442907, "p99_e2e_latency_ms": 19127.044549903363, "mean_ttft_ms": 184.86261386133265, "median_ttft_ms": 114.34125760570168, "std_ttft_ms": 122.27272272576788, "p99_ttft_ms": 429.63656508596614, "mean_tpot_ms": 18.84696523455597, "median_tpot_ms": 19.015752477440632, "std_tpot_ms": 1.875303441368142, "p99_tpot_ms": 25.919375823585423, "mean_itl_ms": 18.50310546566746, "median_itl_ms": 16.952354926615953, "std_itl_ms": 12.890325841392592, "p95_itl_ms": 17.591408849693835, "p99_itl_ms": 94.11955502349883, "concurrency": 13.925076826578909, "accept_length": null, "max_output_tokens_per_s": 976.0, "max_concurrent_requests": 20} +{"tag": "flashinfer", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 16, "sharegpt_output_len": null, "random_input_len": 1000, "random_output_len": 1000, "random_range_ratio": 0.0, "server_info": null, "duration": 47.810525686945766, "completed": 80, "total_input_tokens": 39668, "total_input_text_tokens": 39668, "total_input_vision_tokens": 0, "total_output_tokens": 40805, "total_output_tokens_retokenized": 37815, "request_throughput": 1.6732717084899837, "input_throughput": 829.6917766547584, "output_throughput": 853.4731508116723, "total_throughput": 1683.1649274664308, "mean_e2e_latency_ms": 8313.95380628528, "median_e2e_latency_ms": 8911.577637074515, "std_e2e_latency_ms": 4493.426788691228, "p90_e2e_latency_ms": 13624.376828689134, "p99_e2e_latency_ms": 16482.360729521602, "mean_ttft_ms": 166.6187765513314, "median_ttft_ms": 107.28733043652028, "std_ttft_ms": 107.45469984138889, "p99_ttft_ms": 383.4053916204721, "mean_tpot_ms": 16.219748328613264, "median_tpot_ms": 16.46422699847982, "std_tpot_ms": 1.3754006608474518, "p99_tpot_ms": 20.223300155676718, "mean_itl_ms": 16.117470535603395, "median_itl_ms": 14.493613503873348, "std_itl_ms": 23.759563258631825, "p95_itl_ms": 15.231409226544201, "p99_itl_ms": 91.41340630594641, "concurrency": 13.911503689749775, "accept_length": null, "max_output_tokens_per_s": 1120.0, "max_concurrent_requests": 21} +{"tag": 
"lightllm-fa3", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 16, "sharegpt_output_len": null, "random_input_len": 1000, "random_output_len": 1000, "random_range_ratio": 0.0, "server_info": null, "duration": 39.198021210031584, "completed": 80, "total_input_tokens": 39668, "total_input_text_tokens": 39668, "total_input_vision_tokens": 0, "total_output_tokens": 40805, "total_output_tokens_retokenized": 40779, "request_throughput": 2.0409193507841246, "input_throughput": 1011.9898600863081, "output_throughput": 1040.9964263593274, "total_throughput": 2052.9862864456354, "mean_e2e_latency_ms": 6749.5986932568485, "median_e2e_latency_ms": 7180.660139070824, "std_e2e_latency_ms": 3674.73758483094, "p90_e2e_latency_ms": 11592.498308001086, "p99_e2e_latency_ms": 13352.02830265276, "mean_ttft_ms": 65.81717804365326, "median_ttft_ms": 65.59701240621507, "std_ttft_ms": 16.49426206282749, "p99_ttft_ms": 103.18048950051889, "mean_tpot_ms": 13.250411406873202, "median_tpot_ms": 13.433356619192136, "std_tpot_ms": 0.6795819356745668, "p99_tpot_ms": 13.671014163799812, "mean_itl_ms": 13.132804988486683, "median_itl_ms": 12.713762000203133, "std_itl_ms": 1.5002947899704477, "p95_itl_ms": 14.877910260111094, "p99_itl_ms": 15.338663104921581, "concurrency": 13.775386583095141, "accept_length": null, "max_output_tokens_per_s": 1232.0, "max_concurrent_requests": 22} +{"tag": "lightllm-flashinfer", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 16, "sharegpt_output_len": null, "random_input_len": 1000, "random_output_len": 1000, "random_range_ratio": 0.0, "server_info": null, "duration": 38.999191141920164, "completed": 80, "total_input_tokens": 39668, "total_input_text_tokens": 39668, "total_input_vision_tokens": 0, "total_output_tokens": 40805, "total_output_tokens_retokenized": 40769, "request_throughput": 2.0513245956531683, "input_throughput": 1017.1493007546235, "output_throughput": 1046.303751570344, "total_throughput": 2063.4530523249678, "mean_e2e_latency_ms": 6713.866910606157, "median_e2e_latency_ms": 7081.338235875592, "std_e2e_latency_ms": 3648.244918498425, "p90_e2e_latency_ms": 11412.26524873637, "p99_e2e_latency_ms": 13317.006446453743, "mean_ttft_ms": 63.56341727369, "median_ttft_ms": 54.98786806128919, "std_ttft_ms": 18.798902727973804, "p99_ttft_ms": 125.0289089186117, "mean_tpot_ms": 13.195385200587856, "median_tpot_ms": 13.332349463977804, "std_tpot_ms": 0.6546048286199364, "p99_tpot_ms": 13.744833133619448, "mean_itl_ms": 13.070555955399977, "median_itl_ms": 12.730133486911654, "std_itl_ms": 1.5181346036760426, "p95_itl_ms": 14.938965067267418, "p99_itl_ms": 15.352271823212506, "concurrency": 13.77232032566836, "accept_length": null, "max_output_tokens_per_s": 1248.0, "max_concurrent_requests": 21} +{"tag": "lightllm-triton", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 16, "sharegpt_output_len": null, "random_input_len": 1000, "random_output_len": 1000, "random_range_ratio": 0.0, "server_info": null, "duration": 46.33363204495981, "completed": 80, "total_input_tokens": 39668, "total_input_text_tokens": 39668, "total_input_vision_tokens": 0, "total_output_tokens": 40805, "total_output_tokens_retokenized": 40771, "request_throughput": 1.7266075735735986, "input_throughput": 856.1383653564689, "output_throughput": 880.6777754958837, "total_throughput": 1736.8161408523526, "mean_e2e_latency_ms": 8104.25450251787, "median_e2e_latency_ms": 
8717.964101932012, "std_e2e_latency_ms": 4437.408088077443, "p90_e2e_latency_ms": 13669.85152466223, "p99_e2e_latency_ms": 16101.117293201383, "mean_ttft_ms": 67.62399404251482, "median_ttft_ms": 64.90619748365134, "std_ttft_ms": 15.426835098115847, "p99_ttft_ms": 97.62805712874979, "mean_tpot_ms": 15.921614299470969, "median_tpot_ms": 16.282790201281642, "std_tpot_ms": 1.0562744090505234, "p99_tpot_ms": 16.695227297923605, "mean_itl_ms": 15.791763753396411, "median_itl_ms": 16.721565974876285, "std_itl_ms": 1.887789638138326, "p95_itl_ms": 17.124590510502458, "p99_itl_ms": 18.57489356771111, "concurrency": 13.99286720221529, "accept_length": null, "max_output_tokens_per_s": 1040.0, "max_concurrent_requests": 21} diff --git a/last_bench/sglang-oai_0123_80_1000_8000.jsonl b/last_bench/sglang-oai_0123_80_1000_8000.jsonl new file mode 100644 index 000000000..28b40d7f4 --- /dev/null +++ b/last_bench/sglang-oai_0123_80_1000_8000.jsonl @@ -0,0 +1,7 @@ +{"tag": "triton", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 16, "sharegpt_output_len": null, "random_input_len": 1000, "random_output_len": 8000, "random_range_ratio": 0.0, "server_info": {"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30000, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 1, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": null, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 604505033, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, 
"uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "triton", "decode_attention_backend": null, "prefill_attention_backend": null, "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, 
"max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": false, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", 
"enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "status": "ready", "max_total_num_tokens": 1386224, "max_req_input_len": 202746, "internal_states": [{"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30000, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 1, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": 3500, 
"pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 604505033, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "triton", "decode_attention_backend": "triton", "prefill_attention_backend": "triton", "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", 
"enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, 
"allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": true, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "use_mla_backend": true, "last_gen_throughput": 65.3115398409416, "memory_usage": {"weight": 56.33, "kvcache": 69.9, "token_capacity": 1386224, "graph": 1.69}, "effective_max_running_requests_per_dp": 3500}], "version": "0.0.0.dev1+g1e8e0cca2"}, "duration": 510.3684002109803, "completed": 80, "total_input_tokens": 39668, "total_input_text_tokens": 39668, "total_input_vision_tokens": 0, "total_output_tokens": 318306, "total_output_tokens_retokenized": 318138, "request_throughput": 0.1567495165588797, "input_throughput": 77.7242477857205, "output_throughput": 623.6788952223845, "total_throughput": 701.403143008105, "mean_e2e_latency_ms": 88432.47628718382, "median_e2e_latency_ms": 85325.70422510616, "std_e2e_latency_ms": 53979.79939917359, "p90_e2e_latency_ms": 167731.33851240855, "p99_e2e_latency_ms": 178339.03011296637, "mean_ttft_ms": 250.5359776376281, "median_ttft_ms": 117.73683200590312, "std_ttft_ms": 279.6251500610817, "p99_ttft_ms": 880.0148263876326, "mean_tpot_ms": 22.046003086600507, "median_tpot_ms": 22.13179399591731, "std_tpot_ms": 1.311077596299769, "p99_tpot_ms": 23.973088838541113, "mean_itl_ms": 22.17242102734307, "median_itl_ms": 22.142937988974154, "std_itl_ms": 4.26715641637369, "p95_itl_ms": 24.86142996931448, "p99_itl_ms": 27.21198833314702, 
"concurrency": 13.861747906120655, "accept_length": null, "max_output_tokens_per_s": 929.0, "max_concurrent_requests": 18} +{"tag": "fa3", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 16, "sharegpt_output_len": null, "random_input_len": 1000, "random_output_len": 8000, "random_range_ratio": 0.0, "server_info": {"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30001, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 1, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": null, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 398782077, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": 
"default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "fa3", "decode_attention_backend": null, "prefill_attention_backend": null, "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, 
"ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": false, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, 
"encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "status": "ready", "max_total_num_tokens": 1386225, "max_req_input_len": 202746, "internal_states": [{"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30001, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 1, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": 3500, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 398782077, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, 
"bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "fa3", "decode_attention_backend": "fa3", "prefill_attention_backend": "fa3", "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", 
"hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": false, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, 
"debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "use_mla_backend": true, "last_gen_throughput": 111.70549511187465, "memory_usage": {"weight": 56.33, "kvcache": 69.9, "token_capacity": 1386225, "graph": 2.36}, "effective_max_running_requests_per_dp": 3500}], "version": "0.0.0.dev1+g1e8e0cca2"}, "duration": 378.9256413059775, "completed": 80, "total_input_tokens": 39668, "total_input_text_tokens": 39668, "total_input_vision_tokens": 0, "total_output_tokens": 318306, "total_output_tokens_retokenized": 318087, "request_throughput": 0.21112321595413241, "input_throughput": 104.68544663085656, "output_throughput": 840.0223297187009, "total_throughput": 944.7077763495574, "mean_e2e_latency_ms": 66920.71866179758, "median_e2e_latency_ms": 67326.38371211942, "std_e2e_latency_ms": 40074.098965241894, "p90_e2e_latency_ms": 125597.93171191122, "p99_e2e_latency_ms": 134578.84816802104, "mean_ttft_ms": 214.4721156655578, "median_ttft_ms": 109.45712006650865, "std_ttft_ms": 219.49495984278872, "p99_ttft_ms": 720.4752121167257, "mean_tpot_ms": 16.863758946605724, "median_tpot_ms": 17.003689203956057, "std_tpot_ms": 0.6634634044921125, "p99_tpot_ms": 17.452971537097646, "mean_itl_ms": 16.771740067354198, "median_itl_ms": 16.822382458485663, "std_itl_ms": 4.257912709980094, "p95_itl_ms": 17.406025959644467, "p99_itl_ms": 17.852395898662504, "concurrency": 14.12851733784043, "accept_length": null, "max_output_tokens_per_s": 1008.0, "max_concurrent_requests": 19} +{"tag": "flashmla", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 16, "sharegpt_output_len": null, "random_input_len": 1000, "random_output_len": 8000, "random_range_ratio": 0.0, "server_info": {"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30002, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, 
"nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 64, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": null, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 379113072, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "flashmla", "decode_attention_backend": null, "prefill_attention_backend": null, "sampling_backend": "flashinfer", "grammar_backend": 
"xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, 
"disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": false, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "status": "ready", "max_total_num_tokens": 1386176, 
"max_req_input_len": 202746, "internal_states": [{"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30002, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 64, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": 3500, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 379113072, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", 
"dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "flashmla", "decode_attention_backend": "flashmla", "prefill_attention_backend": "flashmla", "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, 
"disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": false, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, 
"remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "use_mla_backend": true, "last_gen_throughput": 88.10241504348274, "memory_usage": {"weight": 56.33, "kvcache": 69.9, "token_capacity": 1386176, "graph": 1.21}, "effective_max_running_requests_per_dp": 3500}], "version": "0.0.0.dev1+g1e8e0cca2"}, "duration": 388.088292311877, "completed": 80, "total_input_tokens": 39668, "total_input_text_tokens": 39668, "total_input_vision_tokens": 0, "total_output_tokens": 318306, "total_output_tokens_retokenized": 318054, "request_throughput": 0.20613865861150507, "input_throughput": 102.21385387251479, "output_throughput": 820.1896483499216, "total_throughput": 922.4035022224364, "mean_e2e_latency_ms": 67931.14652846998, "median_e2e_latency_ms": 68187.61040759273, "std_e2e_latency_ms": 40723.917366219466, "p90_e2e_latency_ms": 127324.08863757737, "p99_e2e_latency_ms": 136953.1988685066, "mean_ttft_ms": 294.5981682394631, "median_ttft_ms": 111.34997848421335, "std_ttft_ms": 363.3717001890983, "p99_ttft_ms": 1089.565416739788, "mean_tpot_ms": 17.089183333079028, "median_tpot_ms": 17.174373370728233, "std_tpot_ms": 0.5925423207604643, "p99_tpot_ms": 17.82896875321477, "mean_itl_ms": 17.006230745068496, "median_itl_ms": 16.955296974629164, "std_itl_ms": 4.835724451681868, "p95_itl_ms": 17.620283225551248, "p99_itl_ms": 18.00625145435333, "concurrency": 14.003235423320401, "accept_length": null, "max_output_tokens_per_s": 992.0, "max_concurrent_requests": 19} +{"tag": "flashinfer", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 16, "sharegpt_output_len": null, "random_input_len": 1000, "random_output_len": 8000, "random_range_ratio": 0.0, "server_info": {"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30003, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 1, 
"swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": null, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 253908945, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "flashinfer", "decode_attention_backend": null, "prefill_attention_backend": null, "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, 
"speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": 
"eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": false, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "status": "ready", "max_total_num_tokens": 1386222, "max_req_input_len": 202746, "internal_states": [{"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30003, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, 
"modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 1, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": 3500, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 253908945, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "flashinfer", "decode_attention_backend": "flashinfer", "prefill_attention_backend": "flashinfer", "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, 
"speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, 
"enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": false, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "use_mla_backend": true, "last_gen_throughput": 115.96867184133801, "memory_usage": {"weight": 56.33, "kvcache": 69.9, "token_capacity": 1386222, "graph": 1.71}, "effective_max_running_requests_per_dp": 3500}], "version": "0.0.0.dev1+g1e8e0cca2"}, "duration": 336.1375814990606, "completed": 80, "total_input_tokens": 39668, 
"total_input_text_tokens": 39668, "total_input_vision_tokens": 0, "total_output_tokens": 318306, "total_output_tokens_retokenized": 289408, "request_throughput": 0.2379977854401965, "input_throughput": 118.01120191052145, "output_throughput": 946.95153865409, "total_throughput": 1064.9627405646113, "mean_e2e_latency_ms": 59016.91139079048, "median_e2e_latency_ms": 59690.270767663606, "std_e2e_latency_ms": 35464.544102828455, "p90_e2e_latency_ms": 110727.35556068365, "p99_e2e_latency_ms": 119681.62642030511, "mean_ttft_ms": 214.80026175267994, "median_ttft_ms": 109.64243847411126, "std_ttft_ms": 214.38356439875133, "p99_ttft_ms": 738.7441637879238, "mean_tpot_ms": 14.840095201035492, "median_tpot_ms": 14.978818821285847, "std_tpot_ms": 0.49894420942441764, "p99_tpot_ms": 15.18216584803879, "mean_itl_ms": 14.98574704604075, "median_itl_ms": 14.707728987559676, "std_itl_ms": 41.27780934363587, "p95_itl_ms": 15.214697923511267, "p99_itl_ms": 15.75610991567373, "concurrency": 14.045894214528444, "accept_length": null, "max_output_tokens_per_s": 1120.0, "max_concurrent_requests": 18} +{"tag": "lightllm-fa3", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 16, "sharegpt_output_len": null, "random_input_len": 1000, "random_output_len": 8000, "random_range_ratio": 0.0, "server_info": null, "duration": 301.9964598421939, "completed": 80, "total_input_tokens": 39668, "total_input_text_tokens": 39668, "total_input_vision_tokens": 0, "total_output_tokens": 318306, "total_output_tokens_retokenized": 315950, "request_throughput": 0.26490376755344563, "input_throughput": 131.352533141376, "output_throughput": 1054.0057329358383, "total_throughput": 1185.3582660772142, "mean_e2e_latency_ms": 53554.92747395474, "median_e2e_latency_ms": 54525.81842453219, "std_e2e_latency_ms": 31906.4896132971, "p90_e2e_latency_ms": 98788.09053299484, "p99_e2e_latency_ms": 110253.29957607435, "mean_ttft_ms": 468.15795370785054, "median_ttft_ms": 129.87286353018135, "std_ttft_ms": 722.0932614354268, "p99_ttft_ms": 2356.574236305896, "mean_tpot_ms": 13.43742028334658, "median_tpot_ms": 13.60711470533312, "std_tpot_ms": 0.6681434813613275, "p99_tpot_ms": 14.401517221818835, "mean_itl_ms": 13.34789935342274, "median_itl_ms": 12.691037962213159, "std_itl_ms": 7.594706030906152, "p95_itl_ms": 14.856397034600377, "p99_itl_ms": 15.24846603162586, "concurrency": 14.186902058902145, "accept_length": null, "max_output_tokens_per_s": 1248.0, "max_concurrent_requests": 18} +{"tag": "lightllm-flashinfer", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 16, "sharegpt_output_len": null, "random_input_len": 1000, "random_output_len": 8000, "random_range_ratio": 0.0, "server_info": null, "duration": 302.9897459298372, "completed": 80, "total_input_tokens": 39668, "total_input_text_tokens": 39668, "total_input_vision_tokens": 0, "total_output_tokens": 318306, "total_output_tokens_retokenized": 318107, "request_throughput": 0.26403533807551843, "input_throughput": 130.9219223847458, "output_throughput": 1050.5504040183246, "total_throughput": 1181.4723264030704, "mean_e2e_latency_ms": 53567.94852038438, "median_e2e_latency_ms": 54704.7826530179, "std_e2e_latency_ms": 32153.43528106655, "p90_e2e_latency_ms": 99366.40834102874, "p99_e2e_latency_ms": 109649.55897029256, "mean_ttft_ms": 206.28674371691886, "median_ttft_ms": 133.14349250867963, "std_ttft_ms": 166.94021679176737, "p99_ttft_ms": 701.204480342567, "mean_tpot_ms": 13.495440161882424, 
"median_tpot_ms": 13.699334770739123, "std_tpot_ms": 0.6245435190179904, "p99_tpot_ms": 14.051371312114576, "mean_itl_ms": 13.423134283187423, "median_itl_ms": 12.703799526207149, "std_itl_ms": 5.127250146021068, "p95_itl_ms": 14.870735118165612, "p99_itl_ms": 15.211140103638169, "concurrency": 14.143831397591656, "accept_length": null, "max_output_tokens_per_s": 1246.0, "max_concurrent_requests": 18} +{"tag": "lightllm-triton", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 16, "sharegpt_output_len": null, "random_input_len": 1000, "random_output_len": 8000, "random_range_ratio": 0.0, "server_info": null, "duration": 469.9071873531211, "completed": 80, "total_input_tokens": 39668, "total_input_text_tokens": 39668, "total_input_vision_tokens": 0, "total_output_tokens": 318306, "total_output_tokens_retokenized": 318227, "request_throughput": 0.17024638514388674, "input_throughput": 84.41667007359624, "output_throughput": 677.3805733701251, "total_throughput": 761.7972434437214, "mean_e2e_latency_ms": 80378.25429544318, "median_e2e_latency_ms": 79052.61100351345, "std_e2e_latency_ms": 49463.25117134018, "p90_e2e_latency_ms": 152817.94724198992, "p99_e2e_latency_ms": 160306.36509028258, "mean_ttft_ms": 167.8032555442769, "median_ttft_ms": 133.42473388183862, "std_ttft_ms": 93.26758803871225, "p99_ttft_ms": 393.4040538291447, "mean_tpot_ms": 19.99498991112318, "median_tpot_ms": 20.358366379219007, "std_tpot_ms": 1.171219880118829, "p99_tpot_ms": 21.51192666823354, "mean_itl_ms": 20.179870766167, "median_itl_ms": 20.82169963978231, "std_itl_ms": 5.298449338488774, "p95_itl_ms": 22.915064496919513, "p99_itl_ms": 23.15957001177594, "concurrency": 13.684107237975288, "accept_length": null, "max_output_tokens_per_s": 1040.0, "max_concurrent_requests": 18} diff --git a/last_bench/sglang-oai_0123_80_8000_1000.jsonl b/last_bench/sglang-oai_0123_80_8000_1000.jsonl new file mode 100644 index 000000000..ec5185eb6 --- /dev/null +++ b/last_bench/sglang-oai_0123_80_8000_1000.jsonl @@ -0,0 +1,7 @@ +{"tag": "triton", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 16, "sharegpt_output_len": null, "random_input_len": 8000, "random_output_len": 1000, "random_range_ratio": 0.0, "server_info": {"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30000, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, 
"schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 1, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": null, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 604505033, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "triton", "decode_attention_backend": null, "prefill_attention_backend": null, "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", 
"speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 
320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": false, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "status": "ready", "max_total_num_tokens": 1386224, "max_req_input_len": 202746, "internal_states": [{"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30000, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", 
"enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 1, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": 3500, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 604505033, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "triton", "decode_attention_backend": "triton", "prefill_attention_backend": "triton", "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", 
"nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, 
"enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": true, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "use_mla_backend": true, "last_gen_throughput": 151.9219302705729, "memory_usage": {"weight": 56.33, "kvcache": 69.9, "token_capacity": 1386224, "graph": 1.69}, 
"effective_max_running_requests_per_dp": 3500}], "version": "0.0.0.dev1+g1e8e0cca2"}, "duration": 91.72738593397662, "completed": 80, "total_input_tokens": 300020, "total_input_text_tokens": 300020, "total_input_vision_tokens": 0, "total_output_tokens": 41669, "total_output_tokens_retokenized": 41658, "request_throughput": 0.8721495678246218, "input_throughput": 3270.778916734288, "output_throughput": 454.2700042710521, "total_throughput": 3725.0489210053397, "mean_e2e_latency_ms": 16465.93430028588, "median_e2e_latency_ms": 16535.203597974032, "std_e2e_latency_ms": 8297.40087689733, "p90_e2e_latency_ms": 27848.679464147426, "p99_e2e_latency_ms": 34066.74822968196, "mean_ttft_ms": 1275.6713679729728, "median_ttft_ms": 316.7236299486831, "std_ttft_ms": 2237.0240657685317, "p99_ttft_ms": 8538.692513904534, "mean_tpot_ms": 31.521857190271305, "median_tpot_ms": 29.72083288174972, "std_tpot_ms": 17.82936716609273, "p99_tpot_ms": 77.68743860196824, "mean_itl_ms": 29.22327457401753, "median_itl_ms": 23.241017595864832, "std_itl_ms": 87.96533202479867, "p95_itl_ms": 24.558546382468194, "p99_itl_ms": 199.04860019916651, "concurrency": 14.360757483822947, "accept_length": null, "max_output_tokens_per_s": 720.0, "max_concurrent_requests": 19} +{"tag": "fa3", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 16, "sharegpt_output_len": null, "random_input_len": 8000, "random_output_len": 1000, "random_range_ratio": 0.0, "server_info": {"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30001, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 1, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": null, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 398782077, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": 
null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "fa3", "decode_attention_backend": null, "prefill_attention_backend": null, "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, 
"eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": false, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, 
"enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "status": "ready", "max_total_num_tokens": 1386225, "max_req_input_len": 202746, "internal_states": [{"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30001, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 1, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", 
"enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": 3500, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 398782077, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "fa3", "decode_attention_backend": "fa3", "prefill_attention_backend": "fa3", "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, 
"speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": 
false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": false, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "use_mla_backend": true, "last_gen_throughput": 219.95093668460453, "memory_usage": {"weight": 56.33, "kvcache": 69.9, "token_capacity": 1386225, "graph": 2.36}, "effective_max_running_requests_per_dp": 3500}], "version": "0.0.0.dev1+g1e8e0cca2"}, "duration": 59.73137983889319, "completed": 80, "total_input_tokens": 300020, "total_input_text_tokens": 300020, "total_input_vision_tokens": 0, "total_output_tokens": 41669, "total_output_tokens_retokenized": 41662, "request_throughput": 1.3393295151689968, "input_throughput": 5022.82051426253, "output_throughput": 697.6065195947116, "total_throughput": 5720.427033857241, "mean_e2e_latency_ms": 10622.85218659672, "median_e2e_latency_ms": 11066.243920009583, "std_e2e_latency_ms": 5402.915248171756, "p90_e2e_latency_ms": 17654.418413643725, "p99_e2e_latency_ms": 20623.906321420785, "mean_ttft_ms": 451.66798550635576, "median_ttft_ms": 203.69619643315673, "std_ttft_ms": 607.1807883246723, "p99_ttft_ms": 2349.38725421438, "mean_tpot_ms": 20.256926417193917, 
"median_tpot_ms": 20.214214266303014, "std_tpot_ms": 4.371429622338443, "p99_tpot_ms": 35.06027535497477, "mean_itl_ms": 19.5660749396803, "median_itl_ms": 16.61551301367581, "std_itl_ms": 26.438658107068314, "p95_itl_ms": 17.541494616307315, "p99_itl_ms": 110.56301650125533, "concurrency": 14.227499468786501, "accept_length": null, "max_output_tokens_per_s": 992.0, "max_concurrent_requests": 20} +{"tag": "flashmla", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 16, "sharegpt_output_len": null, "random_input_len": 8000, "random_output_len": 1000, "random_range_ratio": 0.0, "server_info": {"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30002, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 64, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": null, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 379113072, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, 
"enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "flashmla", "decode_attention_backend": null, "prefill_attention_backend": null, "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, 
"hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": false, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, 
"disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "status": "ready", "max_total_num_tokens": 1386176, "max_req_input_len": 202746, "internal_states": [{"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30002, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 64, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": 3500, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 379113072, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, 
"log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "flashmla", "decode_attention_backend": "flashmla", "prefill_attention_backend": "flashmla", "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": 
null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": false, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, 
"enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "use_mla_backend": true, "last_gen_throughput": 206.06980700252126, "memory_usage": {"weight": 56.33, "kvcache": 69.9, "token_capacity": 1386176, "graph": 1.21}, "effective_max_running_requests_per_dp": 3500}], "version": "0.0.0.dev1+g1e8e0cca2"}, "duration": 61.09140785806812, "completed": 80, "total_input_tokens": 300020, "total_input_text_tokens": 300020, "total_input_vision_tokens": 0, "total_output_tokens": 41669, "total_output_tokens_retokenized": 41654, "request_throughput": 1.3095131182090558, "input_throughput": 4911.001571563511, "output_throughput": 682.0762765331643, "total_throughput": 5593.077848096676, "mean_e2e_latency_ms": 10810.300631891005, "median_e2e_latency_ms": 11076.29925746005, "std_e2e_latency_ms": 5473.510942669237, "p90_e2e_latency_ms": 17724.158798204742, "p99_e2e_latency_ms": 21265.637267699916, "mean_ttft_ms": 537.9770307714352, "median_ttft_ms": 201.4428024413064, "std_ttft_ms": 779.0185306377831, "p99_ttft_ms": 2850.2713821735233, "mean_tpot_ms": 20.48185703899536, "median_tpot_ms": 20.238185403098377, "std_tpot_ms": 4.742075680759459, "p99_tpot_ms": 35.28982754592937, "mean_itl_ms": 19.764911403941884, "median_itl_ms": 16.86290360521525, "std_itl_ms": 27.32488423973038, "p95_itl_ms": 17.988642130512744, "p99_itl_ms": 103.51196154253557, "concurrency": 14.156230489244916, "accept_length": null, "max_output_tokens_per_s": 960.0, "max_concurrent_requests": 20} +{"tag": "flashinfer", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 16, "sharegpt_output_len": null, "random_input_len": 8000, "random_output_len": 1000, "random_range_ratio": 0.0, "server_info": {"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", 
"tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30003, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 1, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": null, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 253908945, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, "admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, 
"enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "flashinfer", "decode_attention_backend": null, "prefill_attention_backend": null, "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, "dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 
144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": false, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, "disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, 
"sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "status": "ready", "max_total_num_tokens": 1386222, "max_req_input_len": 202746, "internal_states": [{"model_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_path": "/dev/shm/GLM-4.7-Flash/", "tokenizer_mode": "auto", "tokenizer_worker_num": 1, "skip_tokenizer_init": false, "load_format": "auto", "model_loader_extra_config": "{}", "trust_remote_code": false, "context_length": null, "is_embedding": false, "enable_multimodal": null, "revision": null, "model_impl": "auto", "host": "127.0.0.1", "port": 30003, "fastapi_root_path": "", "grpc_mode": false, "skip_server_warmup": false, "warmups": null, "nccl_port": null, "checkpoint_engine_wait_weights_before_ready": false, "dtype": "auto", "quantization": null, "quantization_param_path": null, "kv_cache_dtype": "auto", "enable_fp32_lm_head": false, "modelopt_quant": null, "modelopt_checkpoint_restore_path": null, "modelopt_checkpoint_save_path": null, "modelopt_export_path": null, "quantize_and_serve": false, "rl_quant_profile": null, "mem_fraction_static": 0.907, "max_running_requests": null, "max_queued_requests": null, "max_total_tokens": null, "chunked_prefill_size": 8192, "enable_dynamic_chunking": false, "max_prefill_tokens": 16384, "prefill_max_requests": null, "schedule_policy": "fcfs", "enable_priority_scheduling": false, "abort_on_priority_when_disabled": false, "schedule_low_priority_values_first": false, "priority_scheduling_preemption_threshold": 10, "schedule_conservativeness": 1.0, "page_size": 1, "swa_full_tokens_ratio": 0.8, "disable_hybrid_swa_memory": false, "radix_eviction_policy": "lru", "enable_prefill_delayer": false, "prefill_delayer_max_delay_passes": 30, "prefill_delayer_token_usage_low_watermark": null, "prefill_delayer_forward_passes_buckets": null, "prefill_delayer_wait_seconds_buckets": null, "device": "cuda", "tp_size": 1, "pp_size": 1, "pp_max_micro_batch_size": 3500, "pp_async_batch_depth": 0, "stream_interval": 1, "stream_output": false, "random_seed": 253908945, "constrained_json_whitespace_pattern": null, "constrained_json_disable_any_whitespace": false, "watchdog_timeout": 300, "soft_watchdog_timeout": null, "dist_timeout": null, "download_dir": null, "model_checksum": null, "base_gpu_id": 0, "gpu_id_step": 1, "sleep_on_idle": false, "custom_sigquit_handler": null, "log_level": "info", "log_level_http": null, "log_requests": false, "log_requests_level": 2, "log_requests_format": "text", "log_requests_target": null, "uvicorn_access_log_exclude_prefixes": [], "crash_dump_folder": null, "show_time_cost": false, "enable_metrics": false, "enable_metrics_for_all_schedulers": false, "tokenizer_metrics_custom_labels_header": "x-custom-labels", "tokenizer_metrics_allowed_custom_labels": null, "bucket_time_to_first_token": null, "bucket_inter_token_latency": null, "bucket_e2e_request_latency": null, "collect_tokens_histogram": false, "prompt_tokens_buckets": null, "generation_tokens_buckets": null, "gc_warning_threshold_secs": 0.0, "decode_log_interval": 40, "enable_request_time_stats_logging": false, "kv_events_config": null, "enable_trace": false, "otlp_traces_endpoint": "localhost:4317", "export_metrics_to_file": false, "export_metrics_to_file_dir": null, "api_key": null, 
"admin_api_key": null, "served_model_name": "/dev/shm/GLM-4.7-Flash/", "weight_version": "default", "chat_template": null, "hf_chat_template_name": null, "completion_template": null, "file_storage_path": "sglang_storage", "enable_cache_report": false, "reasoning_parser": null, "tool_call_parser": null, "tool_server": null, "sampling_defaults": "model", "dp_size": 1, "load_balance_method": "round_robin", "dist_init_addr": null, "nnodes": 1, "node_rank": 0, "json_model_override_args": "{}", "preferred_sampling_params": null, "enable_lora": null, "enable_lora_overlap_loading": null, "max_lora_rank": null, "lora_target_modules": null, "lora_paths": null, "max_loaded_loras": null, "max_loras_per_batch": 8, "lora_eviction_policy": "lru", "lora_backend": "csgmv", "max_lora_chunk_size": 16, "attention_backend": "flashinfer", "decode_attention_backend": "flashinfer", "prefill_attention_backend": "flashinfer", "sampling_backend": "flashinfer", "grammar_backend": "xgrammar", "mm_attention_backend": null, "fp8_gemm_runner_backend": "auto", "fp4_gemm_runner_backend": "auto", "nsa_prefill_backend": "flashmla_sparse", "nsa_decode_backend": "fa3", "disable_flashinfer_autotune": false, "speculative_algorithm": null, "speculative_draft_model_path": null, "speculative_draft_model_revision": null, "speculative_draft_load_format": null, "speculative_num_steps": null, "speculative_eagle_topk": null, "speculative_num_draft_tokens": null, "speculative_accept_threshold_single": 1.0, "speculative_accept_threshold_acc": 1.0, "speculative_token_map": null, "speculative_attention_mode": "prefill", "speculative_draft_attention_backend": null, "speculative_moe_runner_backend": "auto", "speculative_moe_a2a_backend": null, "speculative_draft_model_quantization": null, "speculative_ngram_min_match_window_size": 1, "speculative_ngram_max_match_window_size": 12, "speculative_ngram_min_bfs_breadth": 1, "speculative_ngram_max_bfs_breadth": 10, "speculative_ngram_match_type": "BFS", "speculative_ngram_branch_length": 18, "speculative_ngram_capacity": 10000000, "enable_multi_layer_eagle": false, "ep_size": 1, "moe_a2a_backend": "none", "moe_runner_backend": "auto", "flashinfer_mxfp4_moe_precision": "default", "enable_flashinfer_allreduce_fusion": true, "deepep_mode": "auto", "ep_num_redundant_experts": 0, "ep_dispatch_algorithm": null, "init_expert_location": "trivial", "enable_eplb": false, "eplb_algorithm": "auto", "eplb_rebalance_num_iterations": 1000, "eplb_rebalance_layers_per_chunk": null, "eplb_min_rebalancing_utilization_threshold": 1.0, "expert_distribution_recorder_mode": null, "expert_distribution_recorder_buffer_size": 1000, "enable_expert_distribution_metrics": false, "deepep_config": null, "moe_dense_tp_size": null, "elastic_ep_backend": null, "mooncake_ib_device": null, "max_mamba_cache_size": null, "mamba_ssm_dtype": "float32", "mamba_full_memory_ratio": 0.9, "mamba_scheduler_strategy": "no_buffer", "mamba_track_interval": 256, "enable_hierarchical_cache": false, "hicache_ratio": 2.0, "hicache_size": 0, "hicache_write_policy": "write_through", "hicache_io_backend": "kernel", "hicache_mem_layout": "layer_first", "disable_hicache_numa_detect": false, "hicache_storage_backend": null, "hicache_storage_prefetch_policy": "best_effort", "hicache_storage_backend_extra_config": null, "hierarchical_sparse_attention_extra_config": null, "enable_lmcache": false, "kt_weight_path": null, "kt_method": "AMXINT4", "kt_cpuinfer": null, "kt_threadpool_count": 2, "kt_num_gpu_experts": null, "kt_max_deferred_experts_per_token": null, 
"dllm_algorithm": null, "dllm_algorithm_config": null, "enable_double_sparsity": false, "ds_channel_config_path": null, "ds_heavy_channel_num": 32, "ds_heavy_token_num": 256, "ds_heavy_channel_type": "qk", "ds_sparse_decode_threshold": 4096, "cpu_offload_gb": 0, "offload_group_size": -1, "offload_num_in_group": 1, "offload_prefetch_step": 1, "offload_mode": "cpu", "multi_item_scoring_delimiter": null, "disable_radix_cache": false, "cuda_graph_max_bs": 256, "cuda_graph_bs": [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], "disable_cuda_graph": false, "disable_cuda_graph_padding": false, "enable_profile_cuda_graph": false, "enable_cudagraph_gc": false, "enable_layerwise_nvtx_marker": false, "enable_nccl_nvls": false, "enable_symm_mem": false, "disable_flashinfer_cutlass_moe_fp4_allgather": false, "enable_tokenizer_batch_encode": false, "disable_tokenizer_batch_decode": false, "disable_outlines_disk_cache": false, "disable_custom_all_reduce": false, "enable_mscclpp": false, "enable_torch_symm_mem": false, "disable_overlap_schedule": false, "enable_mixed_chunk": false, "enable_dp_attention": false, "enable_dp_lm_head": false, "enable_two_batch_overlap": false, "enable_single_batch_overlap": false, "tbo_token_distribution_threshold": 0.48, "enable_torch_compile": false, "enable_piecewise_cuda_graph": false, "enable_torch_compile_debug_mode": false, "torch_compile_max_bs": 32, "piecewise_cuda_graph_max_tokens": 2048, "piecewise_cuda_graph_tokens": [4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048], "piecewise_cuda_graph_compiler": "eager", "torchao_config": "", "enable_nan_detection": false, "enable_p2p_check": false, "triton_attention_reduce_in_fp32": false, "triton_attention_num_kv_splits": 8, "triton_attention_split_tile_size": null, "num_continuous_decode_steps": 1, "delete_ckpt_after_loading": false, "enable_memory_saver": false, "enable_weights_cpu_backup": false, "enable_draft_weights_cpu_backup": false, "allow_auto_truncate": false, "enable_custom_logit_processor": false, "flashinfer_mla_disable_ragged": false, "disable_shared_experts_fusion": false, "disable_chunked_prefix_cache": false, "disable_fast_image_processor": false, "keep_mm_feature_on_device": false, "enable_return_hidden_states": false, "enable_return_routed_experts": false, "scheduler_recv_interval": 1, "numa_node": null, "enable_deterministic_inference": false, "rl_on_policy_target": null, "enable_attn_tp_input_scattered": false, "enable_nsa_prefill_context_parallel": false, "nsa_prefill_cp_mode": "in-seq-split", "enable_fused_qk_norm_rope": false, "enable_precise_embedding_interpolation": false, "enable_dynamic_batch_tokenizer": false, "dynamic_batch_tokenizer_batch_size": 32, "dynamic_batch_tokenizer_batch_timeout": 0.002, "debug_tensor_dump_output_folder": null, "debug_tensor_dump_layers": null, "debug_tensor_dump_input_file": null, "debug_tensor_dump_inject": false, "disaggregation_mode": "null", "disaggregation_transfer_backend": "mooncake", "disaggregation_bootstrap_port": 8998, "disaggregation_decode_tp": null, "disaggregation_decode_dp": null, "disaggregation_prefill_pp": 1, "disaggregation_ib_device": null, "disaggregation_decode_enable_offload_kvcache": false, "disaggregation_decode_enable_fake_auto": false, "num_reserved_decode_tokens": 512, 
"disaggregation_decode_polling_interval": 1, "encoder_only": false, "language_only": false, "encoder_transfer_backend": "zmq_to_scheduler", "encoder_urls": [], "custom_weight_loader": [], "weight_loader_disable_mmap": false, "remote_instance_weight_loader_seed_instance_ip": null, "remote_instance_weight_loader_seed_instance_service_port": null, "remote_instance_weight_loader_send_weights_group_ports": null, "remote_instance_weight_loader_backend": "nccl", "remote_instance_weight_loader_start_seed_via_transfer_engine": false, "enable_pdmux": false, "pdmux_config_path": null, "sm_group_num": 8, "mm_max_concurrent_calls": 32, "mm_per_request_timeout": 10.0, "enable_broadcast_mm_inputs_process": false, "enable_prefix_mm_cache": false, "mm_enable_dp_encoder": false, "mm_process_config": {}, "limit_mm_data_per_request": null, "decrypted_config_file": null, "decrypted_draft_config_file": null, "forward_hooks": null, "use_mla_backend": true, "last_gen_throughput": 230.46929758756366, "memory_usage": {"weight": 56.33, "kvcache": 69.9, "token_capacity": 1386222, "graph": 1.71}, "effective_max_running_requests_per_dp": 3500}], "version": "0.0.0.dev1+g1e8e0cca2"}, "duration": 55.89285264792852, "completed": 80, "total_input_tokens": 300020, "total_input_text_tokens": 300020, "total_input_vision_tokens": 0, "total_output_tokens": 41669, "total_output_tokens_retokenized": 38512, "request_throughput": 1.431310019259948, "input_throughput": 5367.770399729619, "output_throughput": 745.5157149067846, "total_throughput": 6113.286114636404, "mean_e2e_latency_ms": 9957.88358815189, "median_e2e_latency_ms": 10363.739449880086, "std_e2e_latency_ms": 5054.131718005112, "p90_e2e_latency_ms": 16646.5964271687, "p99_e2e_latency_ms": 19347.230267827385, "mean_ttft_ms": 453.599761048099, "median_ttft_ms": 213.0170369055122, "std_ttft_ms": 603.1003102286579, "p99_ttft_ms": 2351.627971509006, "mean_tpot_ms": 18.996200609382797, "median_tpot_ms": 18.850297664384062, "std_tpot_ms": 4.495230873739031, "p99_tpot_ms": 34.1858196159775, "mean_itl_ms": 18.34537334671199, "median_itl_ms": 15.283621032722294, "std_itl_ms": 27.258801329256364, "p95_itl_ms": 16.486559237819165, "p99_itl_ms": 109.11040107021108, "concurrency": 14.252818550346, "accept_length": null, "max_output_tokens_per_s": 1056.0, "max_concurrent_requests": 20} +{"tag": "lightllm-fa3", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 16, "sharegpt_output_len": null, "random_input_len": 8000, "random_output_len": 1000, "random_range_ratio": 0.0, "server_info": null, "duration": 45.33212866913527, "completed": 80, "total_input_tokens": 300020, "total_input_text_tokens": 300020, "total_input_vision_tokens": 0, "total_output_tokens": 41669, "total_output_tokens_retokenized": 41658, "request_throughput": 1.7647527779666923, "input_throughput": 6618.264105569588, "output_throughput": 919.1935438136762, "total_throughput": 7537.457649383264, "mean_e2e_latency_ms": 8019.016220018965, "median_e2e_latency_ms": 8524.129228899255, "std_e2e_latency_ms": 4157.89820587007, "p90_e2e_latency_ms": 13101.259714760823, "p99_e2e_latency_ms": 15792.665833330246, "mean_ttft_ms": 181.4006418280769, "median_ttft_ms": 156.01945854723454, "std_ttft_ms": 135.5525971806731, "p99_ttft_ms": 734.0356547082774, "mean_tpot_ms": 15.553320464290449, "median_tpot_ms": 15.566042172658063, "std_tpot_ms": 2.911541114674202, "p99_tpot_ms": 26.047497648062944, "mean_itl_ms": 15.084295362151842, "median_itl_ms": 12.69083796069026, "std_itl_ms": 
13.922606183367431, "p95_itl_ms": 15.18232161179185, "p99_itl_ms": 88.5000554844737, "concurrency": 14.151581150838433, "accept_length": null, "max_output_tokens_per_s": 1232.0, "max_concurrent_requests": 21} +{"tag": "lightllm-flashinfer", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 16, "sharegpt_output_len": null, "random_input_len": 8000, "random_output_len": 1000, "random_range_ratio": 0.0, "server_info": null, "duration": 45.5442786780186, "completed": 80, "total_input_tokens": 300020, "total_input_text_tokens": 300020, "total_input_vision_tokens": 0, "total_output_tokens": 41669, "total_output_tokens_retokenized": 41662, "request_throughput": 1.7565323751325772, "input_throughput": 6587.435539840947, "output_throughput": 914.911844242492, "total_throughput": 7502.3473840834395, "mean_e2e_latency_ms": 8048.459070990793, "median_e2e_latency_ms": 8570.676261559129, "std_e2e_latency_ms": 4178.828855557125, "p90_e2e_latency_ms": 13119.105676165787, "p99_e2e_latency_ms": 15836.62676514126, "mean_ttft_ms": 184.9294237967115, "median_ttft_ms": 149.6679214760661, "std_ttft_ms": 134.0790601278319, "p99_ttft_ms": 661.3055070326664, "mean_tpot_ms": 15.603147434568124, "median_tpot_ms": 15.65682640210949, "std_tpot_ms": 2.6834146850235157, "p99_tpot_ms": 25.564559472766067, "mean_itl_ms": 15.133441347685872, "median_itl_ms": 12.702923035249114, "std_itl_ms": 13.846450207312868, "p95_itl_ms": 15.275433706119653, "p99_itl_ms": 93.04464281536639, "concurrency": 14.137378928124793, "accept_length": null, "max_output_tokens_per_s": 1248.0, "max_concurrent_requests": 21} +{"tag": "lightllm-triton", "backend": "sglang-oai", "dataset_name": "random", "request_rate": Infinity, "max_concurrency": 16, "sharegpt_output_len": null, "random_input_len": 8000, "random_output_len": 1000, "random_range_ratio": 0.0, "server_info": null, "duration": 71.63242872082628, "completed": 80, "total_input_tokens": 300020, "total_input_text_tokens": 300020, "total_input_vision_tokens": 0, "total_output_tokens": 41669, "total_output_tokens_retokenized": 41654, "request_throughput": 1.1168126144624346, "input_throughput": 4188.326507387746, "output_throughput": 581.7058104004399, "total_throughput": 4770.0323177881855, "mean_e2e_latency_ms": 12452.863597802934, "median_e2e_latency_ms": 13016.981037915684, "std_e2e_latency_ms": 6482.27810358746, "p90_e2e_latency_ms": 20901.55612407253, "p99_e2e_latency_ms": 24121.33116378681, "mean_ttft_ms": 225.4112859343877, "median_ttft_ms": 181.95387651212513, "std_ttft_ms": 187.30133292464953, "p99_ttft_ms": 973.6838864628226, "mean_tpot_ms": 24.17535566961863, "median_tpot_ms": 24.092253083753874, "std_tpot_ms": 4.128153213237987, "p99_tpot_ms": 38.4425003147593, "mean_itl_ms": 23.532990351701816, "median_itl_ms": 20.95877891406417, "std_itl_ms": 17.612692753270967, "p95_itl_ms": 23.275281325913966, "p99_itl_ms": 105.4545730212703, "concurrency": 13.907515152206376, "accept_length": null, "max_output_tokens_per_s": 782.0, "max_concurrent_requests": 20} diff --git a/lightllm/common/basemodel/attention/flashinfer/mla.py b/lightllm/common/basemodel/attention/flashinfer/mla.py index 6c74b22e1..537dbee22 100644 --- a/lightllm/common/basemodel/attention/flashinfer/mla.py +++ b/lightllm/common/basemodel/attention/flashinfer/mla.py @@ -16,6 +16,8 @@ def __init__(self, model): self.qk_nope_head_dim = model.qk_nope_head_dim self.qk_rope_head_dim = model.qk_rope_head_dim self.kv_lora_rank = model.kv_lora_rank + # v_head_dim may differ from 
qk_nope_head_dim (e.g., GLM-4.7-Flash: v_head_dim=256, qk_nope_head_dim=192) + self.v_head_dim = getattr(model, "v_head_dim", self.qk_nope_head_dim) self.q_data_type = model.data_type self.kv_data_type = model.data_type self.workspace_buffer = torch.empty(256 * 1024 * 1024, dtype=torch.int8, device=get_current_device_id()) @@ -69,7 +71,7 @@ def init_state(self): num_qo_heads=self.backend.tp_q_head_num, num_kv_heads=self.backend.tp_q_head_num, head_dim_qk=self.backend.qk_nope_head_dim + self.backend.qk_rope_head_dim, - head_dim_vo=self.backend.qk_nope_head_dim, + head_dim_vo=self.backend.v_head_dim, # Use v_head_dim, not qk_nope_head_dim q_data_type=self.backend.q_data_type, causal=True, sm_scale=self.backend.softmax_scale, @@ -101,7 +103,8 @@ def _mla_prefill_att( ) -> torch.Tensor: self.backend: MlaFlashInferAttBackend = self.backend # for typing k_nope, k_rope = k - o_tensor = alloc_func((q.shape[0], q.shape[1], k_nope.shape[2]), q.dtype, device="cuda") + # Output dimension is v_head_dim (from v.shape[-1]), not qk_nope_head_dim + o_tensor = alloc_func((q.shape[0], q.shape[1], v.shape[-1]), q.dtype, device="cuda") q_head_num = q.shape[1] k = torch.cat([k_nope, torch.repeat_interleave(k_rope, q_head_num, dim=-2)], dim=-1) self.prefill_wrapper.run(q, k, v, out=o_tensor) diff --git a/lightllm/common/basemodel/attention/triton/mla.py b/lightllm/common/basemodel/attention/triton/mla.py index 8288193ad..fbdae4012 100644 --- a/lightllm/common/basemodel/attention/triton/mla.py +++ b/lightllm/common/basemodel/attention/triton/mla.py @@ -44,7 +44,8 @@ def _mla_prefill_att( qk_rope_head_dim = 64 q_nope, q_rope = q[:, :, :-qk_rope_head_dim], q[:, :, -qk_rope_head_dim:] - o_tensor = alloc_func(q_nope.shape, dtype=q_nope.dtype, device=q.device) + # GLM-4.7-Flash : v_head_dim != qk_nope_head_dim + o_tensor = alloc_func((q_nope.shape[0], q_nope.shape[1], v.shape[-1]), dtype=q_nope.dtype, device=q.device) k_nope, k_rope = k assert att_control.mla_prefill softmax_scale = att_control.mla_prefill_dict["softmax_scale"] diff --git a/lightllm/common/basemodel/basemodel.py b/lightllm/common/basemodel/basemodel.py index 26d51af3d..7967004a3 100755 --- a/lightllm/common/basemodel/basemodel.py +++ b/lightllm/common/basemodel/basemodel.py @@ -1028,6 +1028,7 @@ def _gen_special_model_input(self, token_num: int): "Deepseek3MTPModel" in str(self.__class__) or "Qwen3MOEMTPModel" in str(self.__class__) or "MistralMTPModel" in str(self.__class__) + or "Glm4MoeLiteMTPModel" in str(self.__class__) ) if is_mtp_draft_model: special_model_input["mtp_draft_input_hiddens"] = torch.randn( diff --git a/lightllm/common/basemodel/triton_kernel/att/decode_att/gqa/flash_decoding/gqa_flash_decoding_vsm.py b/lightllm/common/basemodel/triton_kernel/att/decode_att/gqa/flash_decoding/gqa_flash_decoding_vsm.py index 6a9bb79c7..141587ff3 100644 --- a/lightllm/common/basemodel/triton_kernel/att/decode_att/gqa/flash_decoding/gqa_flash_decoding_vsm.py +++ b/lightllm/common/basemodel/triton_kernel/att/decode_att/gqa/flash_decoding/gqa_flash_decoding_vsm.py @@ -81,8 +81,11 @@ def _fwd_kernel_calcu_index_and_block_seq( vsm_count, batch_size, BLOCK_N: tl.constexpr, + MAX_BATCH_SIZE: tl.constexpr, ): - b_seq_len = tl.load(b_seq_len + tl.arange(0, 2048), mask=tl.arange(0, 2048) < batch_size, other=0) + b_seq_len = tl.load( + b_seq_len + tl.arange(0, MAX_BATCH_SIZE), mask=tl.arange(0, MAX_BATCH_SIZE) < batch_size, other=0 + ) total_token_num = tl.sum(b_seq_len) block_seq = tl.cdiv(total_token_num, vsm_count * 4) @@ -93,9 +96,9 @@ def 
_fwd_kernel_calcu_index_and_block_seq( cumsum_seq_len = tl.cumsum(block_seq_len) batch_start_index = cumsum_seq_len - block_seq_len tl.store( - mid_o_batch_start_index + tl.arange(0, 2048), + mid_o_batch_start_index + tl.arange(0, MAX_BATCH_SIZE), batch_start_index, - mask=tl.arange(0, 2048) < batch_size, + mask=tl.arange(0, MAX_BATCH_SIZE) < batch_size, ) tl.store(mid_o_decode_att_block_seq, block_seq) @@ -455,7 +458,6 @@ def gqa_token_decode_attention_flash_decoding_vsm( ) if not hasattr(infer_state, "decode_att_block_seq"): - assert batch_size <= 2048 decode_att_block_seq = torch.empty( [ 1, @@ -477,6 +479,7 @@ def gqa_token_decode_attention_flash_decoding_vsm( num_vsm, batch_size, BLOCK_N=run_config["BLOCK_N"], + MAX_BATCH_SIZE=triton.next_power_of_2(batch_size), num_warps=4, ) diff --git a/lightllm/common/basemodel/triton_kernel/mla_att/decode_att/gqa_flash_decoding.py b/lightllm/common/basemodel/triton_kernel/mla_att/decode_att/gqa_flash_decoding.py index 28839b5f5..063181d99 100644 --- a/lightllm/common/basemodel/triton_kernel/mla_att/decode_att/gqa_flash_decoding.py +++ b/lightllm/common/basemodel/triton_kernel/mla_att/decode_att/gqa_flash_decoding.py @@ -67,7 +67,6 @@ def gqa_token_decode_attention_flash_decoding( ) if not hasattr(infer_state, "decode_att_block_seq"): - assert batch_size <= 2048 decode_att_block_seq = torch.empty( [ 1, @@ -89,6 +88,7 @@ def gqa_token_decode_attention_flash_decoding( vsm_count, batch_size, BLOCK_N=BLOCK_N, + MAX_BATCH_SIZE=triton.next_power_of_2(batch_size), num_warps=4, ) @@ -134,8 +134,11 @@ def _fwd_kernel_calcu_index_and_block_seq( num_sm, batch_size, BLOCK_N: tl.constexpr, + MAX_BATCH_SIZE: tl.constexpr, ): - b_seq_len = tl.load(b_seq_len_ptr + tl.arange(0, 2048), mask=tl.arange(0, 2048) < batch_size, other=0) + b_seq_len = tl.load( + b_seq_len_ptr + tl.arange(0, MAX_BATCH_SIZE), mask=tl.arange(0, MAX_BATCH_SIZE) < batch_size, other=0 + ) total_token_num = tl.sum(b_seq_len) block_seq = tl.cast(total_token_num / (num_sm * 4), dtype=tl.int32) + 1 @@ -144,6 +147,10 @@ def _fwd_kernel_calcu_index_and_block_seq( block_seq_len = tl.cdiv(b_seq_len, block_seq) cumsum_seq_len = tl.cumsum(block_seq_len) batch_start_index = cumsum_seq_len - block_seq_len - tl.store(mid_o_batch_start_index_ptr + tl.arange(0, 2048), batch_start_index, mask=tl.arange(0, 2048) < batch_size) + tl.store( + mid_o_batch_start_index_ptr + tl.arange(0, MAX_BATCH_SIZE), + batch_start_index, + mask=tl.arange(0, MAX_BATCH_SIZE) < batch_size, + ) tl.store(mid_o_decode_att_block_seq_ptr, block_seq) return diff --git a/lightllm/common/basemodel/triton_kernel/mla_att/prefill_att/context_flashattention_nopad_with_v.py b/lightllm/common/basemodel/triton_kernel/mla_att/prefill_att/context_flashattention_nopad_with_v.py index be0635182..d79020844 100644 --- a/lightllm/common/basemodel/triton_kernel/mla_att/prefill_att/context_flashattention_nopad_with_v.py +++ b/lightllm/common/basemodel/triton_kernel/mla_att/prefill_att/context_flashattention_nopad_with_v.py @@ -36,6 +36,9 @@ def _fwd_kernel_with_v( BLOCK_DMODEL: tl.constexpr, BLOCK_ROPE_DMODEL: tl.constexpr, BLOCK_N: tl.constexpr, + BLOCK_V_DMODEL: tl.constexpr, + ACTUAL_DMODEL: tl.constexpr, + ACTUAL_V_DMODEL: tl.constexpr, ): cur_batch = tl.program_id(0) cur_head = tl.program_id(1) @@ -53,8 +56,13 @@ def _fwd_kernel_with_v( # initialize offsets offs_n = tl.arange(0, BLOCK_N) offs_d = tl.arange(0, BLOCK_DMODEL) + offs_v_d = tl.arange(0, BLOCK_V_DMODEL) offs_rope_d = tl.arange(0, BLOCK_ROPE_DMODEL) offs_m = start_m * BLOCK_M + tl.arange(0, 
BLOCK_M) + + d_mask = offs_d < ACTUAL_DMODEL + v_d_mask = offs_v_d < ACTUAL_V_DMODEL + off_q = (cur_batch_in_q_start_index + offs_m[:, None]) * stride_q_bs + cur_head * stride_q_h + offs_d[None, :] off_q_rope = ( (cur_batch_in_q_start_index + offs_m[:, None]) * stride_q_rope_bs @@ -63,9 +71,10 @@ def _fwd_kernel_with_v( ) off_k = offs_n[None, :] * stride_k_bs + cur_k_head * stride_k_h + offs_d[:, None] off_k_rope = offs_n[None, :] * stride_k_rope_bs + offs_rope_d[:, None] - off_v = offs_n[:, None] * stride_vbs + cur_k_head * stride_vh + offs_d[None, :] + off_v = offs_n[:, None] * stride_vbs + cur_k_head * stride_vh + offs_v_d[None, :] - q = tl.load(Q_nope + off_q, mask=offs_m[:, None] < cur_batch_seq_len, other=0.0) + q_mask = (offs_m[:, None] < cur_batch_seq_len) & d_mask[None, :] + q = tl.load(Q_nope + off_q, mask=q_mask, other=0.0) q_rope = tl.load(Q_rope + off_q_rope, mask=offs_m[:, None] < cur_batch_seq_len, other=0.0) k_ptrs = K_nope + off_k @@ -75,7 +84,7 @@ def _fwd_kernel_with_v( # initialize pointer to m and l m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf") l_i = tl.zeros([BLOCK_M], dtype=tl.float32) - acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32) + acc = tl.zeros([BLOCK_M, BLOCK_V_DMODEL], dtype=tl.float32) block_mask = tl.where(block_start_loc < cur_batch_seq_len, 1, 0) block_end_loc = tl.minimum((start_m + 1) * BLOCK_M + prompt_cache_len, cur_batch_seq_len + prompt_cache_len) @@ -83,14 +92,16 @@ def _fwd_kernel_with_v( for start_n in range(0, block_mask * block_end_loc, BLOCK_N): start_n = tl.multiple_of(start_n, BLOCK_N) # -- compute qk ---- + k_seq_mask = (start_n + offs_n[None, :]) < block_end_loc + k_mask = k_seq_mask & d_mask[:, None] k = tl.load( k_ptrs + (cur_batch_in_kv_start_index + start_n) * stride_k_bs, - mask=(start_n + offs_n[None, :]) < block_end_loc, + mask=k_mask, other=0.0, ) k_rope = tl.load( k_rope_ptrs + (cur_batch_in_kv_start_index + start_n) * stride_k_rope_bs, - mask=(start_n + offs_n[None, :]) < block_end_loc, + mask=k_seq_mask, other=0.0, ) @@ -112,9 +123,11 @@ def _fwd_kernel_with_v( # -- update output accumulator -- acc = acc * alpha[:, None] # update acc + v_seq_mask = (start_n + offs_n[:, None]) < block_end_loc + v_mask = v_seq_mask & v_d_mask[None, :] v = tl.load( v_ptrs + (cur_batch_in_kv_start_index + start_n) * stride_vbs, - mask=(start_n + offs_n[:, None]) < block_end_loc, + mask=v_mask, other=0.0, ) p = p.to(v.dtype) @@ -124,9 +137,10 @@ def _fwd_kernel_with_v( acc = acc / l_i[:, None] # initialize pointers to output - off_o = (cur_batch_in_q_start_index + offs_m[:, None]) * stride_obs + cur_head * stride_oh + offs_d[None, :] + off_o = (cur_batch_in_q_start_index + offs_m[:, None]) * stride_obs + cur_head * stride_oh + offs_v_d[None, :] out_ptrs = Out + off_o - tl.store(out_ptrs, acc, mask=offs_m[:, None] < cur_batch_seq_len) + o_mask = (offs_m[:, None] < cur_batch_seq_len) & v_d_mask[None, :] + tl.store(out_ptrs, acc, mask=o_mask) return @@ -149,13 +163,14 @@ def context_attention_fwd_with_v( BLOCK = 128 if not is_tesla() else 64 q_nope_dim = q_nope.shape[-1] q_rope_dim = q_rope.shape[-1] + v_dim = v.shape[-1] assert q_nope_dim == k_nope.shape[-1] assert q_rope_dim == k_rope.shape[-1] - assert q_nope_dim in {16, 32, 64, 128, 256, 512} - assert q_rope_dim in {16, 32, 64, 128, 256} - assert q_nope_dim == v.shape[-1] - if q_nope_dim >= 512: + q_nope_dim_padded = triton.next_power_of_2(q_nope_dim) + v_dim_padded = triton.next_power_of_2(v_dim) + + if q_nope_dim_padded >= 512 or v_dim_padded >= 512: BLOCK = 64 if not 
is_tesla() else 32 else: BLOCK = 128 if not is_tesla() else 64 @@ -167,7 +182,7 @@ def context_attention_fwd_with_v( batch, head = b_seq_len.shape[0], q_nope.shape[1] grid = (batch, head, triton.cdiv(max_input_len, BLOCK)) # batch, head, - num_warps = 4 if q_nope_dim <= 64 else 8 + num_warps = 4 if q_nope_dim_padded <= 64 else 8 _fwd_kernel_with_v[grid]( q_nope, @@ -194,9 +209,12 @@ def context_attention_fwd_with_v( o.stride(1), b_prompt_cache_len=b_prompt_cache_len, BLOCK_M=BLOCK, - BLOCK_DMODEL=q_nope_dim, + BLOCK_DMODEL=q_nope_dim_padded, BLOCK_ROPE_DMODEL=q_rope_dim, BLOCK_N=BLOCK, + BLOCK_V_DMODEL=v_dim_padded, + ACTUAL_DMODEL=q_nope_dim, + ACTUAL_V_DMODEL=v_dim, num_warps=num_warps, num_stages=1, ) diff --git a/lightllm/common/fused_moe/grouped_topk.py b/lightllm/common/fused_moe/grouped_topk.py index fb0323cd4..2687adf14 100644 --- a/lightllm/common/fused_moe/grouped_topk.py +++ b/lightllm/common/fused_moe/grouped_topk.py @@ -227,7 +227,7 @@ def triton_grouped_topk( scores_buffer = torch.empty((token_num, total_expert_num), dtype=dtype, device="cuda") out_topk_weights = torch.empty((token_num, topk), dtype=torch.float32, device="cuda") - out_topk_ids = torch.empty((token_num, topk), dtype=torch.long, device="cuda") + out_topk_ids = torch.empty((token_num, topk), dtype=torch.int32, device="cuda") assert total_expert_num % num_expert_group == 0 diff --git a/lightllm/common/fused_moe/topk_select.py b/lightllm/common/fused_moe/topk_select.py index 5206800ef..a51ab3d03 100644 --- a/lightllm/common/fused_moe/topk_select.py +++ b/lightllm/common/fused_moe/topk_select.py @@ -196,10 +196,12 @@ def select_experts( scoring_func=scoring_func, ) else: - group_score_topk_num = 1 - # for deepseek v3 - if topk_group == 4 and num_expert_group == 8 and top_k == 8: + if correction_bias is not None: group_score_topk_num = 2 + elif topk_group == 4 and num_expert_group == 8 and top_k == 8: + group_score_topk_num = 2 + else: + group_score_topk_num = 1 topk_weights, topk_ids = triton_grouped_topk( hidden_states=hidden_states, diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_H200/grouped_matmul:v1/{K=1536,N=2048,expert_num=64,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=false}_NVIDIA_H200.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_H200/grouped_matmul:v1/{K=1536,N=2048,expert_num=64,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=false}_NVIDIA_H200.json new file mode 100644 index 000000000..deb97363c --- /dev/null +++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_H200/grouped_matmul:v1/{K=1536,N=2048,expert_num=64,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=false}_NVIDIA_H200.json @@ -0,0 +1,110 @@ +{ + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "NEED_TRANS": false, + "num_stages": 2, + "num_warps": 8 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "NEED_TRANS": false, + "num_stages": 3, + "num_warps": 4 + }, + "16384": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "NEED_TRANS": false, + "num_stages": 3, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 16, + "NEED_TRANS": false, + "num_stages": 3, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + 
"GROUP_SIZE_M": 1, + "NEED_TRANS": false, + "num_stages": 4, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "NEED_TRANS": false, + "num_stages": 3, + "num_warps": 4 + }, + "400": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "NEED_TRANS": false, + "num_stages": 3, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "NEED_TRANS": false, + "num_stages": 3, + "num_warps": 8 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "NEED_TRANS": false, + "num_stages": 3, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 16, + "NEED_TRANS": false, + "num_stages": 4, + "num_warps": 4 + }, + "65536": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "NEED_TRANS": false, + "num_stages": 3, + "num_warps": 4 + }, + "8192": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "NEED_TRANS": false, + "num_stages": 3, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_H200/grouped_matmul:v1/{K=2048,N=3072,expert_num=64,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=4,use_fp8_w8a8=false}_NVIDIA_H200.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_H200/grouped_matmul:v1/{K=2048,N=3072,expert_num=64,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=4,use_fp8_w8a8=false}_NVIDIA_H200.json new file mode 100644 index 000000000..a6c93c3f6 --- /dev/null +++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_H200/grouped_matmul:v1/{K=2048,N=3072,expert_num=64,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=4,use_fp8_w8a8=false}_NVIDIA_H200.json @@ -0,0 +1,110 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "NEED_TRANS": false, + "num_stages": 3, + "num_warps": 4 + }, + "100": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "NEED_TRANS": false, + "num_stages": 2, + "num_warps": 8 + }, + "1024": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "NEED_TRANS": false, + "num_stages": 3, + "num_warps": 8 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "NEED_TRANS": false, + "num_stages": 2, + "num_warps": 8 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "NEED_TRANS": false, + "num_stages": 3, + "num_warps": 4 + }, + "16384": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "NEED_TRANS": false, + "num_stages": 3, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "NEED_TRANS": false, + "num_stages": 3, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "NEED_TRANS": false, + "num_stages": 2, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "NEED_TRANS": false, + "num_stages": 3, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 64, + 
"BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "NEED_TRANS": false, + "num_stages": 3, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "NEED_TRANS": false, + "num_stages": 2, + "num_warps": 8 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "NEED_TRANS": false, + "num_stages": 3, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_H200/moe_align_fused:v1/{topk_num=4}_NVIDIA_H200.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_H200/moe_align_fused:v1/{topk_num=4}_NVIDIA_H200.json new file mode 100644 index 000000000..0f0c175b9 --- /dev/null +++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_H200/moe_align_fused:v1/{topk_num=4}_NVIDIA_H200.json @@ -0,0 +1,50 @@ +{ + "1": { + "BLOCK_SIZE": 128, + "num_warps": 8 + }, + "100": { + "BLOCK_SIZE": 128, + "num_warps": 8 + }, + "1024": { + "BLOCK_SIZE": 256, + "num_warps": 8 + }, + "128": { + "BLOCK_SIZE": 128, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE": 128, + "num_warps": 2 + }, + "16384": { + "BLOCK_SIZE": 256, + "num_warps": 8 + }, + "2048": { + "BLOCK_SIZE": 128, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE": 128, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE": 128, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE": 128, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE": 128, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE": 128, + "num_warps": 8 + } +} \ No newline at end of file diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_H200/moe_sum_reduce:v1/{hidden_dim=2048,out_dtype=torch.bfloat16,topk_num=4}_NVIDIA_H200.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_H200/moe_sum_reduce:v1/{hidden_dim=2048,out_dtype=torch.bfloat16,topk_num=4}_NVIDIA_H200.json new file mode 100644 index 000000000..c6c3d54ff --- /dev/null +++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_H200/moe_sum_reduce:v1/{hidden_dim=2048,out_dtype=torch.bfloat16,topk_num=4}_NVIDIA_H200.json @@ -0,0 +1,74 @@ +{ + "1": { + "BLOCK_DIM": 64, + "BLOCK_M": 1, + "NUM_STAGE": 4, + "num_warps": 4 + }, + "100": { + "BLOCK_DIM": 1024, + "BLOCK_M": 1, + "NUM_STAGE": 1, + "num_warps": 8 + }, + "1024": { + "BLOCK_DIM": 1024, + "BLOCK_M": 1, + "NUM_STAGE": 1, + "num_warps": 8 + }, + "128": { + "BLOCK_DIM": 512, + "BLOCK_M": 1, + "NUM_STAGE": 4, + "num_warps": 1 + }, + "16": { + "BLOCK_DIM": 256, + "BLOCK_M": 1, + "NUM_STAGE": 1, + "num_warps": 8 + }, + "16384": { + "BLOCK_DIM": 1024, + "BLOCK_M": 1, + "NUM_STAGE": 4, + "num_warps": 4 + }, + "2048": { + "BLOCK_DIM": 1024, + "BLOCK_M": 1, + "NUM_STAGE": 1, + "num_warps": 1 + }, + "256": { + "BLOCK_DIM": 512, + "BLOCK_M": 1, + "NUM_STAGE": 1, + "num_warps": 4 + }, + "32": { + "BLOCK_DIM": 256, + "BLOCK_M": 1, + "NUM_STAGE": 1, + "num_warps": 2 + }, + "4096": { + "BLOCK_DIM": 1024, + "BLOCK_M": 1, + "NUM_STAGE": 1, + "num_warps": 8 + }, + "64": { + "BLOCK_DIM": 512, + "BLOCK_M": 1, + "NUM_STAGE": 4, + "num_warps": 16 + }, + "8": { + "BLOCK_DIM": 64, + "BLOCK_M": 1, + "NUM_STAGE": 2, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_H200/rotary_emb_fwd:v1/{HEAD_DIM=64,K_HEAD_NUM=1,Q_HEAD_NUM=20,dtype=torch.bfloat16}_NVIDIA_H200.json 
b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_H200/rotary_emb_fwd:v1/{HEAD_DIM=64,K_HEAD_NUM=1,Q_HEAD_NUM=20,dtype=torch.bfloat16}_NVIDIA_H200.json new file mode 100644 index 000000000..5601eab76 --- /dev/null +++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_H200/rotary_emb_fwd:v1/{HEAD_DIM=64,K_HEAD_NUM=1,Q_HEAD_NUM=20,dtype=torch.bfloat16}_NVIDIA_H200.json @@ -0,0 +1,74 @@ +{ + "1": { + "BLOCK_SEQ": 32, + "HEAD_PARALLEL_NUM": 16, + "num_stages": 5, + "num_warps": 8 + }, + "100": { + "BLOCK_SEQ": 1, + "HEAD_PARALLEL_NUM": 16, + "num_stages": 3, + "num_warps": 1 + }, + "1024": { + "BLOCK_SEQ": 1, + "HEAD_PARALLEL_NUM": 2, + "num_stages": 5, + "num_warps": 1 + }, + "128": { + "BLOCK_SEQ": 1, + "HEAD_PARALLEL_NUM": 8, + "num_stages": 5, + "num_warps": 1 + }, + "16": { + "BLOCK_SEQ": 1, + "HEAD_PARALLEL_NUM": 16, + "num_stages": 3, + "num_warps": 2 + }, + "16384": { + "BLOCK_SEQ": 8, + "HEAD_PARALLEL_NUM": 2, + "num_stages": 5, + "num_warps": 1 + }, + "2048": { + "BLOCK_SEQ": 1, + "HEAD_PARALLEL_NUM": 2, + "num_stages": 5, + "num_warps": 1 + }, + "256": { + "BLOCK_SEQ": 1, + "HEAD_PARALLEL_NUM": 8, + "num_stages": 1, + "num_warps": 1 + }, + "32": { + "BLOCK_SEQ": 1, + "HEAD_PARALLEL_NUM": 16, + "num_stages": 4, + "num_warps": 4 + }, + "4096": { + "BLOCK_SEQ": 2, + "HEAD_PARALLEL_NUM": 2, + "num_stages": 3, + "num_warps": 1 + }, + "64": { + "BLOCK_SEQ": 1, + "HEAD_PARALLEL_NUM": 16, + "num_stages": 1, + "num_warps": 1 + }, + "8": { + "BLOCK_SEQ": 1, + "HEAD_PARALLEL_NUM": 16, + "num_stages": 4, + "num_warps": 1 + } +} \ No newline at end of file diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_H200/silu_and_mul_fwd:v1/{N=10240,out_dtype=torch.bfloat16}_NVIDIA_H200.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_H200/silu_and_mul_fwd:v1/{N=10240,out_dtype=torch.bfloat16}_NVIDIA_H200.json new file mode 100644 index 000000000..b82f25e17 --- /dev/null +++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_H200/silu_and_mul_fwd:v1/{N=10240,out_dtype=torch.bfloat16}_NVIDIA_H200.json @@ -0,0 +1,74 @@ +{ + "1": { + "BLOCK_M": 1, + "BLOCK_N": 256, + "NUM_STAGES": 4, + "num_warps": 1 + }, + "100": { + "BLOCK_M": 8, + "BLOCK_N": 256, + "NUM_STAGES": 4, + "num_warps": 1 + }, + "1024": { + "BLOCK_M": 8, + "BLOCK_N": 128, + "NUM_STAGES": 4, + "num_warps": 1 + }, + "128": { + "BLOCK_M": 8, + "BLOCK_N": 128, + "NUM_STAGES": 4, + "num_warps": 1 + }, + "16": { + "BLOCK_M": 1, + "BLOCK_N": 256, + "NUM_STAGES": 1, + "num_warps": 1 + }, + "16384": { + "BLOCK_M": 8, + "BLOCK_N": 128, + "NUM_STAGES": 4, + "num_warps": 1 + }, + "2048": { + "BLOCK_M": 32, + "BLOCK_N": 256, + "NUM_STAGES": 4, + "num_warps": 1 + }, + "256": { + "BLOCK_M": 8, + "BLOCK_N": 256, + "NUM_STAGES": 4, + "num_warps": 1 + }, + "32": { + "BLOCK_M": 1, + "BLOCK_N": 256, + "NUM_STAGES": 2, + "num_warps": 1 + }, + "4096": { + "BLOCK_M": 8, + "BLOCK_N": 128, + "NUM_STAGES": 4, + "num_warps": 1 + }, + "64": { + "BLOCK_M": 8, + "BLOCK_N": 128, + "NUM_STAGES": 4, + "num_warps": 1 + }, + "8": { + "BLOCK_M": 1, + "BLOCK_N": 256, + "NUM_STAGES": 4, + "num_warps": 1 + } +} \ No newline at end of file diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_H200/silu_and_mul_fwd:v1/{N=1536,out_dtype=torch.bfloat16}_NVIDIA_H200.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_H200/silu_and_mul_fwd:v1/{N=1536,out_dtype=torch.bfloat16}_NVIDIA_H200.json 
new file mode 100644 index 000000000..ab4644621 --- /dev/null +++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_H200/silu_and_mul_fwd:v1/{N=1536,out_dtype=torch.bfloat16}_NVIDIA_H200.json @@ -0,0 +1,104 @@ +{ + "1": { + "BLOCK_M": 32, + "BLOCK_N": 128, + "NUM_STAGES": 2, + "num_warps": 4 + }, + "100": { + "BLOCK_M": 1, + "BLOCK_N": 256, + "NUM_STAGES": 2, + "num_warps": 1 + }, + "1024": { + "BLOCK_M": 8, + "BLOCK_N": 256, + "NUM_STAGES": 1, + "num_warps": 4 + }, + "128": { + "BLOCK_M": 1, + "BLOCK_N": 256, + "NUM_STAGES": 1, + "num_warps": 4 + }, + "16": { + "BLOCK_M": 1, + "BLOCK_N": 128, + "NUM_STAGES": 4, + "num_warps": 4 + }, + "16384": { + "BLOCK_M": 1, + "BLOCK_N": 256, + "NUM_STAGES": 4, + "num_warps": 1 + }, + "2048": { + "BLOCK_M": 8, + "BLOCK_N": 256, + "NUM_STAGES": 4, + "num_warps": 1 + }, + "256": { + "BLOCK_M": 1, + "BLOCK_N": 256, + "NUM_STAGES": 2, + "num_warps": 4 + }, + "32": { + "BLOCK_M": 1, + "BLOCK_N": 256, + "NUM_STAGES": 4, + "num_warps": 8 + }, + "4": { + "BLOCK_M": 1, + "BLOCK_N": 32, + "NUM_STAGES": 1, + "num_warps": 4 + }, + "400": { + "BLOCK_M": 1, + "BLOCK_N": 256, + "NUM_STAGES": 1, + "num_warps": 4 + }, + "4096": { + "BLOCK_M": 8, + "BLOCK_N": 256, + "NUM_STAGES": 4, + "num_warps": 1 + }, + "512": { + "BLOCK_M": 8, + "BLOCK_N": 256, + "NUM_STAGES": 4, + "num_warps": 4 + }, + "64": { + "BLOCK_M": 1, + "BLOCK_N": 256, + "NUM_STAGES": 1, + "num_warps": 1 + }, + "65536": { + "BLOCK_M": 1, + "BLOCK_N": 128, + "NUM_STAGES": 4, + "num_warps": 1 + }, + "8": { + "BLOCK_M": 1, + "BLOCK_N": 64, + "NUM_STAGES": 1, + "num_warps": 4 + }, + "8192": { + "BLOCK_M": 8, + "BLOCK_N": 256, + "NUM_STAGES": 4, + "num_warps": 1 + } +} \ No newline at end of file diff --git a/lightllm/models/__init__.py b/lightllm/models/__init__.py index 539b32dec..095f73679 100644 --- a/lightllm/models/__init__.py +++ b/lightllm/models/__init__.py @@ -18,6 +18,7 @@ from lightllm.models.gemma_2b.model import Gemma_2bTpPartModel from lightllm.models.phi3.model import Phi3TpPartModel from lightllm.models.deepseek2.model import Deepseek2TpPartModel +from lightllm.models.glm4_moe_lite.model import Glm4MoeLiteTpPartModel from lightllm.models.internvl.model import ( InternVLLlamaTpPartModel, InternVLPhi3TpPartModel, diff --git a/lightllm/models/deepseek2/model.py b/lightllm/models/deepseek2/model.py index f0739a8a8..c7f13bb63 100644 --- a/lightllm/models/deepseek2/model.py +++ b/lightllm/models/deepseek2/model.py @@ -43,6 +43,8 @@ def _init_some_value(self): self.qk_rope_head_dim = self.config["qk_rope_head_dim"] self.q_lora_rank = self.config["q_lora_rank"] self.kv_lora_rank = self.config["kv_lora_rank"] + # v_head_dim defaults to qk_nope_head_dim for DeepSeek-V2, but GLM-4.7-Flash has different value + self.v_head_dim = self.config.get("v_head_dim", self.qk_nope_head_dim) self.head_dim_ = self.kv_lora_rank + self.qk_rope_head_dim def _init_custom(self): diff --git a/lightllm/models/glm4_moe_lite/__init__.py b/lightllm/models/glm4_moe_lite/__init__.py new file mode 100644 index 000000000..b00657090 --- /dev/null +++ b/lightllm/models/glm4_moe_lite/__init__.py @@ -0,0 +1,4 @@ +from lightllm.models.glm4_moe_lite.model import Glm4MoeLiteTpPartModel +from lightllm.models.glm4_moe_lite.infer_struct import Glm4MoeLiteInferStateInfo + +__all__ = ["Glm4MoeLiteTpPartModel", "Glm4MoeLiteInferStateInfo"] diff --git a/lightllm/models/glm4_moe_lite/infer_struct.py b/lightllm/models/glm4_moe_lite/infer_struct.py new file mode 100644 index 000000000..92c350abc --- /dev/null +++ 
b/lightllm/models/glm4_moe_lite/infer_struct.py @@ -0,0 +1,12 @@ +from lightllm.models.deepseek2.infer_struct import Deepseek2InferStateInfo + + +class Glm4MoeLiteInferStateInfo(Deepseek2InferStateInfo): + """Inference state for GLM-4.7-Flash (glm4_moe_lite architecture). + + Inherits from Deepseek2InferStateInfo as GLM-4.7-Flash uses the same + MLA (Multi-Head Latent Attention) mechanism as DeepSeek-V2/V3. + """ + + def __init__(self): + super().__init__() diff --git a/lightllm/models/glm4_moe_lite/layer_infer/__init__.py b/lightllm/models/glm4_moe_lite/layer_infer/__init__.py new file mode 100644 index 000000000..a95580535 --- /dev/null +++ b/lightllm/models/glm4_moe_lite/layer_infer/__init__.py @@ -0,0 +1,3 @@ +from lightllm.models.glm4_moe_lite.layer_infer.transformer_layer_infer import Glm4MoeLiteTransformerLayerInfer + +__all__ = ["Glm4MoeLiteTransformerLayerInfer"] diff --git a/lightllm/models/glm4_moe_lite/layer_infer/transformer_layer_infer.py b/lightllm/models/glm4_moe_lite/layer_infer/transformer_layer_infer.py new file mode 100644 index 000000000..bcea872fd --- /dev/null +++ b/lightllm/models/glm4_moe_lite/layer_infer/transformer_layer_infer.py @@ -0,0 +1,96 @@ +import os +import torch +import torch.distributed as dist +import triton +from functools import partial +from lightllm.models.deepseek2.layer_infer.transformer_layer_infer import Deepseek2TransformerLayerInfer +from lightllm.models.llama.layer_infer.transformer_layer_infer import LlamaTransformerLayerInfer +from lightllm.distributed.communication_op import reduce_scatter_tensor + + +class Glm4MoeLiteTransformerLayerInfer(Deepseek2TransformerLayerInfer): + def __init__(self, layer_num, network_config): + self._glm4_layer_num = layer_num + self._glm4_first_k_dense = network_config.get("first_k_dense_replace", 0) + self._glm4_has_routed_experts = network_config.get("n_routed_experts") is not None + super().__init__(layer_num, network_config) + + @property + def is_moe(self): + return self._glm4_has_routed_experts and self._glm4_layer_num >= self._glm4_first_k_dense + + @is_moe.setter + def is_moe(self, value): + pass + + def _bind_ffn(self): + if self.is_moe: + moe_mode = os.environ.get("MOE_MODE", "TP") + if moe_mode == "EP": + self._ffn = partial(Deepseek2TransformerLayerInfer._moe_ffn_edp, self) + self._tpsp_ffn = self._tpsp_ffn_ep + else: + self._ffn = partial(Glm4MoeLiteTransformerLayerInfer._moe_ffn, self) + self._tpsp_ffn = self._tpsp_ffn_tp + else: + self._ffn = partial(LlamaTransformerLayerInfer._ffn, self) + self._tpsp_ffn = self._tpsp_ffn_tp + + def _get_o(self, input: torch.Tensor, infer_state, layer_weight) -> torch.Tensor: + if input.shape[2] == self.kv_lora_rank: + input = layer_weight.v_b_proj_.bmm(input.transpose(0, 1)).transpose(0, 1) + o_tensor = layer_weight.o_weight_.mm(input.reshape(-1, self.tp_q_head_num_ * self.v_head_dim)) + return o_tensor + + def _tpsp_get_o(self, input, infer_state, layer_weight) -> torch.Tensor: + if infer_state.need_dp_prefill_balance: + input = infer_state._all_to_all_balance_get(data=input) + + if input.shape[2] == self.kv_lora_rank: + input = layer_weight.v_b_proj_.bmm(input.transpose(0, 1)).transpose(0, 1) + + input = input.reshape(-1, self.tp_q_head_num_ * self.v_head_dim) + dest_size = triton.cdiv(input.shape[0], self.tp_world_size_) * self.tp_world_size_ + o_tensor = self.alloc_tensor((dest_size, self.embed_dim_), dtype=input.dtype, device=input.device) + layer_weight.o_weight_.mm(input, out=o_tensor[0 : len(infer_state.input_ids), :]) + e_o_tensor = 
o_tensor[len(infer_state.input_ids) :, :] + if e_o_tensor.shape[0] > 0: + e_o_tensor.fill_(0) + + if self.tp_world_size_ > 1: + sp_token_num = o_tensor.shape[0] // self.tp_world_size_ + reduce_o_tensor = self.alloc_tensor((sp_token_num, self.embed_dim_), dtype=input.dtype, device=input.device) + reduce_scatter_tensor( + output=reduce_o_tensor, + input=o_tensor, + op=dist.ReduceOp.SUM, + group=infer_state.dist_group, + async_op=False, + ) + o_tensor = reduce_o_tensor + + return o_tensor + + def _moe_ffn(self, input, infer_state, layer_weight): + hidden_states = input.view(-1, self.embed_dim_) + num_tokens, hidden_dim = hidden_states.shape + + if self.n_shared_experts is not None and layer_weight.num_fused_shared_experts == 0: + shared_output = LlamaTransformerLayerInfer._ffn(self, hidden_states, infer_state, layer_weight) + + router_logits = layer_weight.moe_gate.mm(hidden_states.to(torch.float32)) + + layer_weight.experts.experts( + hidden_states, + router_logits=router_logits, + top_k=self.num_experts_per_tok, + renormalize=self.norm_topk_prob, + use_grouped_topk=self.n_group, + topk_group=self.topk_group, + num_expert_group=self.n_group, + ) + + if self.n_shared_experts is not None and layer_weight.num_fused_shared_experts == 0: + hidden_states.add_(shared_output) + + return hidden_states.view(num_tokens, hidden_dim) diff --git a/lightllm/models/glm4_moe_lite/layer_weights/__init__.py b/lightllm/models/glm4_moe_lite/layer_weights/__init__.py new file mode 100644 index 000000000..1fd5e36f8 --- /dev/null +++ b/lightllm/models/glm4_moe_lite/layer_weights/__init__.py @@ -0,0 +1,3 @@ +from lightllm.models.glm4_moe_lite.layer_weights.transformer_layer_weight import Glm4MoeLiteTransformerLayerWeight + +__all__ = ["Glm4MoeLiteTransformerLayerWeight"] diff --git a/lightllm/models/glm4_moe_lite/layer_weights/transformer_layer_weight.py b/lightllm/models/glm4_moe_lite/layer_weights/transformer_layer_weight.py new file mode 100644 index 000000000..b4d41fd47 --- /dev/null +++ b/lightllm/models/glm4_moe_lite/layer_weights/transformer_layer_weight.py @@ -0,0 +1,113 @@ +import os +import torch +from lightllm.models.deepseek2.layer_weights.transformer_layer_weight import Deepseek2TransformerLayerWeight +from lightllm.common.basemodel.layer_weights.meta_weights import ROWMMWeight +from lightllm.common.basemodel.layer_weights.meta_weights.fused_moe_weight_tp import create_tp_moe_wegiht_obj +from lightllm.common.basemodel.layer_weights.meta_weights.fused_moe_weight_ep import FusedMoeWeightEP + + +class Glm4MoeLiteTransformerLayerWeight(Deepseek2TransformerLayerWeight): + def __init__(self, layer_num, data_type, network_config, quant_cfg=None): + super().__init__(layer_num, data_type, network_config, quant_cfg) + + def _parse_config(self): + from lightllm.common.basemodel import TransformerLayerWeight + + TransformerLayerWeight._parse_config(self) + + self.is_moe = self.network_config_.get( + "n_routed_experts" + ) is not None and self.layer_num_ >= self.network_config_.get("first_k_dense_replace", 0) + + self.tp_q_head_num_ = self.network_config_["num_attention_heads"] + self.tp_q_head_num_ = self.tp_q_head_num_ // self.tp_world_size_ + self.n_routed_experts = self.network_config_.get("n_routed_experts") + self.q_lora_rank = self.network_config_.get("q_lora_rank") + self.qk_nope_head_dim = self.network_config_["qk_nope_head_dim"] + self.qk_rope_head_dim = self.network_config_["qk_rope_head_dim"] + self.v_head_dim = self.network_config_["v_head_dim"] + self.num_attention_heads = 
self.network_config_["num_attention_heads"] + self.kv_lora_rank = self.network_config_["kv_lora_rank"] + + from lightllm.utils.envs_utils import get_env_start_args + import os + + self.num_fused_shared_experts = 0 + if get_env_start_args().enable_fused_shared_experts and self.is_moe: + moe_mode = os.getenv("MOE_MODE", "TP") + assert moe_mode == "TP" + self.num_fused_shared_experts = self.network_config_.get("n_shared_experts", 0) + + def _load_kb(self, kv_b_proj_): + kv_dim = self.qk_nope_head_dim + self.v_head_dim + k_b_proj_ = kv_b_proj_.view(self.num_attention_heads, kv_dim, self.kv_lora_rank)[:, : self.qk_nope_head_dim, :] + return k_b_proj_.contiguous().to(kv_b_proj_.dtype) + + def _load_kb_scale(self, kv_b_proj_, block_size): + kv_dim = self.qk_nope_head_dim + self.v_head_dim + k_b_proj_scale_ = kv_b_proj_.view( + self.num_attention_heads, kv_dim // block_size, self.kv_lora_rank // block_size + )[:, : self.qk_nope_head_dim // block_size, :] + return k_b_proj_scale_.contiguous().to(kv_b_proj_.dtype) + + def _load_vb(self, kv_b_proj_): + kv_dim = self.qk_nope_head_dim + self.v_head_dim + v_b_proj_ = kv_b_proj_.T.view(self.kv_lora_rank, self.num_attention_heads, kv_dim)[ + :, :, self.qk_nope_head_dim : + ].transpose(0, 1) + return v_b_proj_.contiguous().to(kv_b_proj_.dtype) + + def _load_vb_scale(self, kv_b_proj_scale_, block_size): + kv_dim = self.qk_nope_head_dim + self.v_head_dim + v_b_proj_scale_ = kv_b_proj_scale_.T.view( + self.kv_lora_rank // block_size, + self.num_attention_heads, + kv_dim // block_size, + )[:, :, self.qk_nope_head_dim // block_size :].transpose(0, 1) + return v_b_proj_scale_.contiguous().to(kv_b_proj_scale_.dtype) + + def _init_moe(self): + moe_intermediate_size = self.network_config_["moe_intermediate_size"] + + self.moe_gate = ROWMMWeight( + weight_names=f"model.layers.{self.layer_num_}.mlp.gate.weight", + data_type=torch.float32, + layer_num=self.layer_num_, + name="moe_gate", + tp_rank=0, + tp_world_size=1, + ) + + if self.num_fused_shared_experts == 0: + self._load_mlp(f"model.layers.{self.layer_num_}.mlp.shared_experts") + + moe_mode = os.getenv("MOE_MODE", "TP") + assert moe_mode in ["EP", "TP"] + if moe_mode == "TP": + self.experts = create_tp_moe_wegiht_obj( + gate_proj_name="gate_proj", + down_proj_name="down_proj", + up_proj_name="up_proj", + e_score_correction_bias_name=self.e_score_correction_bias_name, + weight_prefix=f"model.layers.{self.layer_num_}.mlp.experts", + n_routed_experts=self.n_routed_experts, + num_fused_shared_experts=self.num_fused_shared_experts, + split_inter_size=moe_intermediate_size // self.tp_world_size_, + data_type=self.data_type_, + network_config=self.network_config_, + layer_num=self.layer_num_, + quant_cfg=self.quant_cfg, + ) + else: + self.experts = FusedMoeWeightEP( + gate_proj_name="gate_proj", + down_proj_name="down_proj", + up_proj_name="up_proj", + e_score_correction_bias_name=self.e_score_correction_bias_name, + weight_prefix=f"model.layers.{self.layer_num_}.mlp.experts", + n_routed_experts=self.n_routed_experts, + data_type=self.data_type_, + network_config=self.network_config_, + layer_num=self.layer_num_, + quant_cfg=self.quant_cfg, + ) diff --git a/lightllm/models/glm4_moe_lite/model.py b/lightllm/models/glm4_moe_lite/model.py new file mode 100644 index 000000000..a5970ab59 --- /dev/null +++ b/lightllm/models/glm4_moe_lite/model.py @@ -0,0 +1,74 @@ +import torch +from lightllm.models.registry import ModelRegistry +from lightllm.models.deepseek2.model import Deepseek2TpPartModel +from 
lightllm.models.glm4_moe_lite.layer_infer.transformer_layer_infer import Glm4MoeLiteTransformerLayerInfer +from lightllm.models.glm4_moe_lite.layer_weights.transformer_layer_weight import Glm4MoeLiteTransformerLayerWeight +from lightllm.models.glm4_moe_lite.infer_struct import Glm4MoeLiteInferStateInfo +from lightllm.distributed.communication_op import dist_group_manager +from lightllm.utils.log_utils import init_logger + +logger = init_logger(__name__) + + +@ModelRegistry("glm4_moe_lite") +class Glm4MoeLiteTpPartModel(Deepseek2TpPartModel): + + transformer_weight_class = Glm4MoeLiteTransformerLayerWeight + transformer_layer_infer_class = Glm4MoeLiteTransformerLayerInfer + infer_state_class = Glm4MoeLiteInferStateInfo + + def __init__(self, kvargs): + super().__init__(kvargs) + + def _init_config(self): + super()._init_config() + + if "moe_layer_freq" not in self.config and self.config.get("n_routed_experts"): + self.config["moe_layer_freq"] = 1 + + if "routed_scaling_factor" not in self.config: + self.config["routed_scaling_factor"] = 1.8 + + if "topk_method" not in self.config: + self.config["topk_method"] = "noaux_tc" + + if "scoring_func" not in self.config: + self.config["scoring_func"] = "sigmoid" + + logger.info( + f"GLM-4.7-Flash config: " + f"n_routed_experts={self.config.get('n_routed_experts')}, " + f"n_shared_experts={self.config.get('n_shared_experts')}, " + f"num_experts_per_tok={self.config.get('num_experts_per_tok')}, " + f"first_k_dense_replace={self.config.get('first_k_dense_replace')}, " + f"routed_scaling_factor={self.config.get('routed_scaling_factor')}, " + f"scoring_func={self.config.get('scoring_func')}" + ) + + def _init_custom(self): + self._init_to_get_yarn_rotary() + dist_group_manager.new_deepep_group(self.config["n_routed_experts"], self.config["hidden_size"]) + + def _init_to_get_yarn_rotary(self): + rope_scaling = self.config.get("rope_scaling") + + if rope_scaling is None: + self._init_glm4_standard_rotary() + else: + super()._init_to_get_yarn_rotary() + + def _init_glm4_standard_rotary(self): + rope_theta = self.config.get("rope_theta", 1000000.0) + qk_rope_head_dim = self.config.get("qk_rope_head_dim", 64) + max_position_embeddings = self.config.get("max_position_embeddings", 202752) + + dim = qk_rope_head_dim + + inv_freq = 1.0 / (rope_theta ** (torch.arange(0, dim, 2, device="cpu", dtype=torch.float32) / dim)) + + max_seq_len = max(max_position_embeddings, self.max_seq_length) + t = torch.arange(max_seq_len, device="cpu", dtype=torch.float32) + freqs = torch.outer(t, inv_freq) + + self._cos_cached = torch.cos(freqs).to(self.data_type).cuda() + self._sin_cached = torch.sin(freqs).to(self.data_type).cuda() diff --git a/lightllm/models/glm4_moe_lite_mtp/__init__.py b/lightllm/models/glm4_moe_lite_mtp/__init__.py new file mode 100644 index 000000000..96b6659c8 --- /dev/null +++ b/lightllm/models/glm4_moe_lite_mtp/__init__.py @@ -0,0 +1,3 @@ +from lightllm.models.glm4_moe_lite_mtp.model import Glm4MoeLiteMTPModel + +__all__ = ["Glm4MoeLiteMTPModel"] diff --git a/lightllm/models/glm4_moe_lite_mtp/layer_infer/__init__.py b/lightllm/models/glm4_moe_lite_mtp/layer_infer/__init__.py new file mode 100644 index 000000000..e357bfa19 --- /dev/null +++ b/lightllm/models/glm4_moe_lite_mtp/layer_infer/__init__.py @@ -0,0 +1,3 @@ +from lightllm.models.glm4_moe_lite_mtp.layer_infer.pre_layer_infer import Glm4MoeLiteMTPPreLayerInfer + +__all__ = ["Glm4MoeLiteMTPPreLayerInfer"] diff --git a/lightllm/models/glm4_moe_lite_mtp/layer_infer/pre_layer_infer.py 
b/lightllm/models/glm4_moe_lite_mtp/layer_infer/pre_layer_infer.py new file mode 100644 index 000000000..6994d21f7 --- /dev/null +++ b/lightllm/models/glm4_moe_lite_mtp/layer_infer/pre_layer_infer.py @@ -0,0 +1,82 @@ +import torch + +from lightllm.models.glm4_moe_lite_mtp.layer_weights.pre_and_post_layer_weight import ( + Glm4MoeLiteMTPPreAndPostLayerWeight, +) +from lightllm.models.glm4_moe_lite.infer_struct import Glm4MoeLiteInferStateInfo +from lightllm.models.llama.layer_infer.pre_layer_infer import LlamaPreLayerInfer + + +class Glm4MoeLiteMTPPreLayerInfer(LlamaPreLayerInfer): + def __init__(self, network_config): + super().__init__(network_config) + self.eps_ = network_config["rms_norm_eps"] + self.hidden_size = network_config["hidden_size"] + + def _mtp_context_forward( + self, + input_embdings, + infer_state: Glm4MoeLiteInferStateInfo, + layer_weight: Glm4MoeLiteMTPPreAndPostLayerWeight, + ): + tgt_embdings = infer_state.mtp_draft_input_hiddens + assert ( + input_embdings.shape[0] == tgt_embdings.shape[0] + ), f"shape {input_embdings.shape} != shape {tgt_embdings.shape}" + + layer_weight.enorm_weight_.rmsnorm_forward( + input=input_embdings, + eps=self.eps_, + out=input_embdings, + ) + layer_weight.hnorm_weight_.rmsnorm_forward( + input=tgt_embdings, + eps=self.eps_, + out=tgt_embdings, + ) + cat_embdings = torch.cat((input_embdings, tgt_embdings), dim=-1) + + ans_logics = layer_weight.eh_proj_weight_.mm(cat_embdings) + return ans_logics + + def _mtp_token_forward( + self, + input_embdings, + infer_state: Glm4MoeLiteInferStateInfo, + layer_weight: Glm4MoeLiteMTPPreAndPostLayerWeight, + ): + tgt_embdings = infer_state.mtp_draft_input_hiddens + assert input_embdings.shape[0] == tgt_embdings.shape[0] + + layer_weight.enorm_weight_.rmsnorm_forward( + input=input_embdings, + eps=self.eps_, + out=input_embdings, + ) + layer_weight.hnorm_weight_.rmsnorm_forward( + input=tgt_embdings, + eps=self.eps_, + out=tgt_embdings, + ) + cat_embdings = torch.cat((input_embdings, tgt_embdings), dim=-1) + + ans_logics = layer_weight.eh_proj_weight_.mm(cat_embdings) + return ans_logics + + def context_forward( + self, + input_ids, + infer_state: Glm4MoeLiteInferStateInfo, + layer_weight: Glm4MoeLiteMTPPreAndPostLayerWeight, + ): + input_embdings = super().context_forward(input_ids, infer_state, layer_weight) + return self._mtp_context_forward(input_embdings, infer_state, layer_weight) + + def token_forward( + self, + input_ids, + infer_state: Glm4MoeLiteInferStateInfo, + layer_weight: Glm4MoeLiteMTPPreAndPostLayerWeight, + ): + input_embdings = super().token_forward(input_ids, infer_state, layer_weight) + return self._mtp_token_forward(input_embdings, infer_state, layer_weight) diff --git a/lightllm/models/glm4_moe_lite_mtp/layer_weights/__init__.py b/lightllm/models/glm4_moe_lite_mtp/layer_weights/__init__.py new file mode 100644 index 000000000..57fe578cf --- /dev/null +++ b/lightllm/models/glm4_moe_lite_mtp/layer_weights/__init__.py @@ -0,0 +1,5 @@ +from lightllm.models.glm4_moe_lite_mtp.layer_weights.pre_and_post_layer_weight import ( + Glm4MoeLiteMTPPreAndPostLayerWeight, +) + +__all__ = ["Glm4MoeLiteMTPPreAndPostLayerWeight"] diff --git a/lightllm/models/glm4_moe_lite_mtp/layer_weights/pre_and_post_layer_weight.py b/lightllm/models/glm4_moe_lite_mtp/layer_weights/pre_and_post_layer_weight.py new file mode 100644 index 000000000..a84ada72f --- /dev/null +++ b/lightllm/models/glm4_moe_lite_mtp/layer_weights/pre_and_post_layer_weight.py @@ -0,0 +1,43 @@ +from lightllm.common.basemodel import 
PreAndPostLayerWeight +from lightllm.common.basemodel.layer_weights.meta_weights import ( + EmbeddingWeight, + LMHeadWeight, + NoTpNormWeight, + ROWMMWeight, +) + + +class Glm4MoeLiteMTPPreAndPostLayerWeight(PreAndPostLayerWeight): + def __init__(self, data_type, network_config): + super().__init__(data_type, network_config) + + mtp_layer_idx = network_config["num_hidden_layers"] + + self.eh_proj_weight_ = ROWMMWeight( + weight_names=f"model.layers.{mtp_layer_idx}.eh_proj.weight", + data_type=self.data_type_, + name="eh_proj", + tp_rank=0, + tp_world_size=1, + ) + + self.enorm_weight_ = NoTpNormWeight( + weight_name=f"model.layers.{mtp_layer_idx}.enorm.weight", + data_type=self.data_type_, + bias_name=None, + ) + + self.hnorm_weight_ = NoTpNormWeight( + weight_name=f"model.layers.{mtp_layer_idx}.hnorm.weight", + data_type=self.data_type_, + bias_name=None, + ) + + self.final_norm_weight_ = NoTpNormWeight( + weight_name=f"model.layers.{mtp_layer_idx}.shared_head.norm.weight", + data_type=self.data_type_, + bias_name=None, + ) + + self.wte_weight_: EmbeddingWeight = None + self.lm_head_weight_: LMHeadWeight = None diff --git a/lightllm/models/glm4_moe_lite_mtp/model.py b/lightllm/models/glm4_moe_lite_mtp/model.py new file mode 100644 index 000000000..1e7e68e22 --- /dev/null +++ b/lightllm/models/glm4_moe_lite_mtp/model.py @@ -0,0 +1,88 @@ +from typing import List +from lightllm.models.glm4_moe_lite.model import Glm4MoeLiteTpPartModel +from lightllm.models.glm4_moe_lite_mtp.layer_infer.pre_layer_infer import Glm4MoeLiteMTPPreLayerInfer +from lightllm.models.glm4_moe_lite_mtp.layer_weights.pre_and_post_layer_weight import ( + Glm4MoeLiteMTPPreAndPostLayerWeight, +) +from lightllm.common.basemodel import TpPartBaseModel +from lightllm.common.basemodel.basemodel import load_hf_weights + + +class Glm4MoeLiteMTPModel(Glm4MoeLiteTpPartModel): + + pre_and_post_weight_class = Glm4MoeLiteMTPPreAndPostLayerWeight + pre_layer_infer_class = Glm4MoeLiteMTPPreLayerInfer + + def __init__(self, kvargs: dict): + self._pre_init(kvargs) + super().__init__(kvargs) + + def _pre_init(self, kvargs: dict): + self.main_model: TpPartBaseModel = kvargs.pop("main_model") + self.mtp_previous_draft_models: List[TpPartBaseModel] = kvargs.pop("mtp_previous_draft_models") + + def _init_custom(self): + self._cos_cached = self.main_model._cos_cached + self._sin_cached = self.main_model._sin_cached + + def _init_req_manager(self): + self.req_manager = self.main_model.req_manager + + def _init_mem_manager(self): + self.mem_manager = self.main_model.mem_manager + + def _init_weights(self, start_layer_index=None): + assert start_layer_index is None + + mtp_layer_start = self.config["num_hidden_layers"] + num_mtp_layers = self.config.get("num_nextn_predict_layers", 1) + + self.pre_post_weight = self.pre_and_post_weight_class(self.data_type, network_config=self.config) + + self.trans_layers_weight = [ + self.transformer_weight_class( + i, + self.data_type, + network_config=self.config, + quant_cfg=self.quant_cfg, + ) + for i in range(mtp_layer_start, mtp_layer_start + num_mtp_layers) + ] + + load_hf_weights( + self.data_type, + weight_dir=self.weight_dir_, + pre_post_layer=self.pre_post_weight, + transformer_layer_list=self.trans_layers_weight, + weight_dict=self.weight_dict, + ) + + self.pre_post_weight.verify_load() + [weight.verify_load() for weight in self.trans_layers_weight] + + self.pre_post_weight.wte_weight_ = self.main_model.pre_post_weight.wte_weight_ + self.pre_post_weight.lm_head_weight_ = 
self.main_model.pre_post_weight.lm_head_weight_ + + def _init_infer_layer(self, start_layer_index=None): + assert start_layer_index is None + + self.pre_infer = self.pre_layer_infer_class(network_config=self.config) + self.post_infer = self.post_layer_infer_class(network_config=self.config) + + total_pre_layers_num = len(self.main_model.layers_infer) + total_pre_layers_num += sum( + [len(previous_model.layers_infer) for previous_model in self.mtp_previous_draft_models] + ) + + num_mtp_layers = self.config.get("num_nextn_predict_layers", 1) + self.layers_infer = [ + self.transformer_layer_infer_class(i, network_config=self.config) + for i in range(total_pre_layers_num, total_pre_layers_num + num_mtp_layers) + ] + + def _init_some_value(self): + super()._init_some_value() + self.layers_num = self.config.get("num_nextn_predict_layers", 1) + + def autotune_layers(self): + return self.config.get("num_nextn_predict_layers", 1) diff --git a/lightllm/server/api_cli.py b/lightllm/server/api_cli.py index 44cc38822..296cea6b4 100644 --- a/lightllm/server/api_cli.py +++ b/lightllm/server/api_cli.py @@ -119,7 +119,7 @@ def make_argument_parser() -> argparse.ArgumentParser: parser.add_argument( "--batch_max_tokens", type=int, - default=None, + default=16384, help="max tokens num for new cat batch, it control prefill batch size to Preventing OOM", ) parser.add_argument( @@ -128,7 +128,7 @@ def make_argument_parser() -> argparse.ArgumentParser: parser.add_argument( "--tool_call_parser", type=str, - choices=["qwen25", "llama3", "mistral", "deepseekv3", "qwen", "deepseekv31"], + choices=["qwen25", "llama3", "mistral", "deepseekv3", "qwen", "deepseekv31", "glm47", "kimi_k2"], default=None, help="tool call parser type", ) @@ -259,7 +259,7 @@ def make_argument_parser() -> argparse.ArgumentParser: ) parser.add_argument("--disable_dynamic_prompt_cache", action="store_true", help="disable dynamic prompt cache") - parser.add_argument("--chunked_prefill_size", type=int, default=4096, help="chunked prefill size") + parser.add_argument("--chunked_prefill_size", type=int, default=8192, help="chunked prefill size") parser.add_argument("--disable_chunked_prefill", action="store_true", help="whether to disable chunked prefill") parser.add_argument("--diverse_mode", action="store_true", help="diversity generation mode") parser.add_argument("--token_healing_mode", action="store_true", help="code model infer mode") diff --git a/lightllm/server/function_call_parser.py b/lightllm/server/function_call_parser.py index 1620cff13..9214715b1 100644 --- a/lightllm/server/function_call_parser.py +++ b/lightllm/server/function_call_parser.py @@ -241,7 +241,7 @@ def parse_streaming_increment(self, new_text: str, tools: List[Tool]) -> Streami if start_idx >= len(current_text): return StreamingParseResult() - (obj, end_idx) = _partial_json_loads(current_text[start_idx:], flags) + obj, end_idx = _partial_json_loads(current_text[start_idx:], flags) is_current_complete = _is_complete_json(current_text[start_idx : start_idx + end_idx]) @@ -1173,6 +1173,276 @@ def parse_streaming_increment(self, new_text: str, tools: List[Tool]) -> Streami return StreamingParseResult(normal_text=current_text) +class Glm47Detector(BaseFormatDetector): + """ + Detector for GLM-4.7/GLM-4.7-Flash model function call format. + + The GLM-4.7 format uses an XML-style envelope with arg_key/arg_value pairs + instead of JSON arguments. 
+ + Format Structure: + ``` + <tool_call>function_name + <arg_key>param1</arg_key> + <arg_value>value1</arg_value> + <arg_key>param2</arg_key> + <arg_value>value2</arg_value> + </tool_call> + ``` + + Example: + ``` + <tool_call>tool_brave_web_search_post + <arg_key>query</arg_key> + <arg_value>test search</arg_value> + <arg_key>count</arg_key> + <arg_value>5</arg_value> + </tool_call> + ``` + + Key Components: + - Tool Call Tags: `<tool_call>` and `</tool_call>` wrap each individual call + - Function Name: Appears on the first line after `<tool_call>` + - Arguments: Pairs of `<arg_key>name</arg_key>` and `<arg_value>value</arg_value>` + + Reference: https://github.com/vllm-project/vllm/blob/main/vllm/tool_parsers/glm4_moe_tool_parser.py + """ + + def __init__(self): + super().__init__() + self.bot_token = "<tool_call>" + self.eot_token = "</tool_call>" + self.tool_call_separator = "\n" + + # Regex patterns for parsing GLM-4.7 tool calls + # Match complete tool call blocks + self.func_call_regex = re.compile(r"<tool_call>.*?</tool_call>", re.DOTALL) + # Extract function name and arguments from a tool call block + # Function name can be followed by newline OR directly by <arg_key> + # Pattern: <tool_call>function_name(\n|<arg_key>)...</tool_call> + self.func_detail_regex = re.compile( + r"<tool_call>([^<\n]+?)(?:\n|(?=<arg_key>)|(?=</tool_call>))(.*?)</tool_call>", re.DOTALL + ) + # Extract arg_key/arg_value pairs + self.func_arg_regex = re.compile(r"<arg_key>(.*?)</arg_key>\s*<arg_value>(.*?)</arg_value>", re.DOTALL) + + self._last_arguments = "" + self._normal_text_buffer = "" + + def has_tool_call(self, text: str) -> bool: + """Check if the text contains a GLM-4.7 format tool call.""" + return self.bot_token in text + + def _parse_xml_arguments(self, arg_text: str) -> dict: + """ + Parse XML-style arguments into a dictionary. + + Args: + arg_text: The text containing <arg_key>/<arg_value> pairs + + Returns: + Dictionary of argument name to value + """ + if not arg_text: + return {} + + args = {} + matches = self.func_arg_regex.findall(arg_text) + for key, value in matches: + key = key.strip() + value = value.strip() + # Try to parse value as JSON for complex types (arrays, objects, numbers, booleans) + try: + parsed_value = json.loads(value) + args[key] = parsed_value + except (json.JSONDecodeError, ValueError): + # Keep as string if not valid JSON + args[key] = value + return args + + def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult: + """ + One-time parsing: Detects and parses tool calls in the provided text. + + :param text: The complete text to parse. + :param tools: List of available tools. + :return: StreamingParseResult with normal_text and parsed calls. + """ + idx = text.find(self.bot_token) + normal_text = text[:idx].strip() if idx != -1 else text + + if self.bot_token not in text: + return StreamingParseResult(normal_text=normal_text, calls=[]) + + tool_indices = self._get_tool_indices(tools) + calls = [] + + # Find all <tool_call>...</tool_call>
blocks + match_result_list = self.func_call_regex.findall(text) + + for match_result in match_result_list: + try: + # Extract function name and arguments + func_detail = self.func_detail_regex.search(match_result) + if not func_detail: + logger.warning(f"Failed to parse GLM-4.7 tool call: {match_result}") + continue + + func_name = func_detail.group(1).strip() + arg_text = func_detail.group(2) if func_detail.group(2) else "" + + # Validate function name + if func_name not in tool_indices: + logger.warning(f"Model attempted to call undefined function: {func_name}") + continue + + # Parse XML arguments to JSON + func_args = self._parse_xml_arguments(arg_text) + + calls.append( + ToolCallItem( + tool_index=tool_indices[func_name], + name=func_name, + parameters=json.dumps(func_args, ensure_ascii=False), + ) + ) + except Exception as e: + logger.warning(f"Failed to parse GLM-4.7 tool call: {match_result}, error: {str(e)}") + continue + + return StreamingParseResult(normal_text=normal_text, calls=calls) + + def parse_streaming_increment(self, new_text: str, tools: List[Tool]) -> StreamingParseResult: + """ + Streaming incremental parsing for GLM-4.7 tool calls. + + This handles the streaming case where tool calls arrive incrementally. + """ + self._buffer += new_text + current_text = self._buffer + + # Check if we have a tool call starting + if not self.has_tool_call(current_text): + # Check for partial bot_token at the end + partial_len = self._ends_with_partial_token(current_text, self.bot_token) + if partial_len: + # Might be partial bot_token, keep buffering + return StreamingParseResult() + + # No tool call, emit as normal text + self._buffer = "" + # Clean up any stray end tokens + if self.eot_token in new_text: + new_text = new_text.replace(self.eot_token, "") + return StreamingParseResult(normal_text=new_text) + + # Build tool indices if not already built + if not hasattr(self, "_tool_indices"): + self._tool_indices = self._get_tool_indices(tools) + + calls: List[ToolCallItem] = [] + + try: + # Check if we have a complete tool call + if self.eot_token in current_text: + # We have at least one complete tool call + # Parse all complete tool calls + result = self.detect_and_parse(current_text, tools) + + # Find the end of the last complete tool call + last_end = current_text.rfind(self.eot_token) + if last_end != -1: + remaining = current_text[last_end + len(self.eot_token) :] + self._buffer = remaining.lstrip() + else: + self._buffer = "" + + # Reset state for next tool call + self.current_tool_id = -1 + self.current_tool_name_sent = False + self._last_arguments = "" + + return result + + # We have a partial tool call - try to stream it + # Extract what we can from the partial tool call + tool_call_start = current_text.find(self.bot_token) + if tool_call_start == -1: + return StreamingParseResult() + + # Get content after <tool_call> + content_after_start = current_text[tool_call_start + len(self.bot_token) :] + + # Try to extract function name (first line after <tool_call>) + newline_pos = content_after_start.find("\n") + if newline_pos == -1: + # Still waiting for function name to complete + return StreamingParseResult() + + func_name = content_after_start[:newline_pos].strip() + + # Initialize state if this is the first tool call + if self.current_tool_id == -1: + self.current_tool_id = 0 + self.prev_tool_call_arr = [] + self.streamed_args_for_tool = [""] + + # Ensure we have enough entries + while len(self.prev_tool_call_arr) <= self.current_tool_id: + self.prev_tool_call_arr.append({}) + while
len(self.streamed_args_for_tool) <= self.current_tool_id: + self.streamed_args_for_tool.append("") + + # Check if function name is valid + if func_name and func_name in self._tool_indices: + if not self.current_tool_name_sent: + # Send function name first + calls.append( + ToolCallItem( + tool_index=self.current_tool_id, + name=func_name, + parameters="", + ) + ) + self.current_tool_name_sent = True + self.prev_tool_call_arr[self.current_tool_id] = { + "name": func_name, + "arguments": {}, + } + else: + # Stream arguments incrementally + arg_text = content_after_start[newline_pos + 1 :] + current_args = self._parse_xml_arguments(arg_text) + + if current_args: + current_args_json = json.dumps(current_args, ensure_ascii=False) + prev_args = self.prev_tool_call_arr[self.current_tool_id].get("arguments", {}) + prev_args_json = json.dumps(prev_args, ensure_ascii=False) if prev_args else "" + + if current_args_json != prev_args_json: + # Calculate the diff + sent = len(self.streamed_args_for_tool[self.current_tool_id]) + argument_diff = current_args_json[sent:] + + if argument_diff: + calls.append( + ToolCallItem( + tool_index=self.current_tool_id, + name=None, + parameters=argument_diff, + ) + ) + self.streamed_args_for_tool[self.current_tool_id] += argument_diff + + self.prev_tool_call_arr[self.current_tool_id]["arguments"] = current_args + + return StreamingParseResult(normal_text="", calls=calls) + + except Exception as e: + logger.error(f"Error in GLM-4.7 parse_streaming_increment: {e}") + return StreamingParseResult(normal_text="", calls=calls) + + class FunctionCallParser: """ Parser for function/tool calls in model outputs. @@ -1185,6 +1455,7 @@ class FunctionCallParser: ToolCallParserEnum: Dict[str, Type[BaseFormatDetector]] = { "deepseekv3": DeepSeekV3Detector, "deepseekv31": DeepSeekV31Detector, + "glm47": Glm47Detector, "kimi_k2": KimiK2Detector, "llama3": Llama32Detector, "mistral": MistralDetector, diff --git a/lightllm/server/router/model_infer/mode_backend/base_backend.py b/lightllm/server/router/model_infer/mode_backend/base_backend.py index 805c9b8e5..64310d6b0 100644 --- a/lightllm/server/router/model_infer/mode_backend/base_backend.py +++ b/lightllm/server/router/model_infer/mode_backend/base_backend.py @@ -37,6 +37,7 @@ from lightllm.models.deepseek_mtp.model import Deepseek3MTPModel from lightllm.models.qwen3_moe_mtp.model import Qwen3MOEMTPModel from lightllm.models.mistral_mtp.model import MistralMTPModel +from lightllm.models.glm4_moe_lite_mtp.model import Glm4MoeLiteMTPModel from lightllm.server.router.model_infer.mode_backend.generic_post_process import sample from lightllm.common.basemodel.triton_kernel.gather_token_id import scatter_token from lightllm.server.pd_io_struct import NIXLChunckedTransTaskRet @@ -328,6 +329,9 @@ def init_mtp_draft_model(self, main_kvargs: dict): elif mtp_model_cfg["model_type"] == "mistral": assert self.args.mtp_mode in ["vanilla_no_att", "eagle_no_att"] self.draft_models.append(MistralMTPModel(mtp_model_kvargs)) + elif mtp_model_cfg["model_type"] == "glm4_moe_lite": + assert self.args.mtp_mode in ["vanilla_with_att", "eagle_with_att"] + self.draft_models.append(Glm4MoeLiteMTPModel(mtp_model_kvargs)) else: assert False, f"error mtp mode {mtp_model_cfg['model_type']}" diff --git a/test/eval/eval_bfcl.py b/test/eval/eval_bfcl.py new file mode 100755 index 000000000..59b81c26d --- /dev/null +++ b/test/eval/eval_bfcl.py @@ -0,0 +1,686 @@ +#!/usr/bin/env python3 +""" +BFCL (Berkeley Function Calling Leaderboard) Evaluation Script for LightLLM + 
+This script evaluates function/tool calling capabilities on the BFCL benchmark. + +Usage: + # Start LightLLM server first: + python -m lightllm.server.api_server --model_dir /path/to/GLM-4.7-Flash --tp 1 + + # Run evaluation: + python eval_bfcl.py \ + --model_name GLM-4.7-Flash \ + --base_url http://localhost:8000/v1 \ + --test_category simple + +Test Categories: + - simple: Single function calls (400 examples) + - multiple: Select one function from multiple options (200 examples) + - parallel: Multiple function calls in parallel (200 examples) + - parallel_multiple: Combination of parallel and multiple (200 examples) + - java: Java function calls (100 examples) + - javascript: JavaScript function calls (70 examples) + - irrelevance: Detect when no function should be called + - all: Run all categories + +Requirements: + pip install openai tqdm huggingface_hub +""" + +import argparse +import json +import os +import re +import ast +import time +from concurrent.futures import ThreadPoolExecutor, as_completed +from typing import List, Dict, Any, Optional, Tuple +from dataclasses import dataclass +from collections import defaultdict + +from tqdm import tqdm + +try: + from openai import OpenAI +except ImportError: + print("Please install openai: pip install openai") + exit(1) + +try: + from huggingface_hub import hf_hub_download +except ImportError: + print("Please install huggingface_hub: pip install huggingface_hub") + exit(1) + + +# BFCL Dataset on HuggingFace +BFCL_REPO = "gorilla-llm/Berkeley-Function-Calling-Leaderboard" + +# Test category mappings to filenames +TEST_CATEGORIES = { + "simple": "BFCL_v3_simple.json", + "multiple": "BFCL_v3_multiple.json", + "parallel": "BFCL_v3_parallel.json", + "parallel_multiple": "BFCL_v3_parallel_multiple.json", + "java": "BFCL_v3_java.json", + "javascript": "BFCL_v3_javascript.json", + "irrelevance": "BFCL_v3_irrelevance.json", + "live_simple": "BFCL_v3_live_simple.json", + "live_multiple": "BFCL_v3_live_multiple.json", + "live_parallel": "BFCL_v3_live_parallel.json", + "live_parallel_multiple": "BFCL_v3_live_parallel_multiple.json", + "rest": "BFCL_v3_rest.json", + "sql": "BFCL_v3_sql.json", +} + +# Possible answer files for ground truth +ANSWER_FILES = { + "simple": "possible_answer/BFCL_v3_simple.json", + "multiple": "possible_answer/BFCL_v3_multiple.json", + "parallel": "possible_answer/BFCL_v3_parallel.json", + "parallel_multiple": "possible_answer/BFCL_v3_parallel_multiple.json", + "java": "possible_answer/BFCL_v3_java.json", + "javascript": "possible_answer/BFCL_v3_javascript.json", + "live_simple": "possible_answer/BFCL_v3_live_simple.json", + "live_multiple": "possible_answer/BFCL_v3_live_multiple.json", + "live_parallel": "possible_answer/BFCL_v3_live_parallel.json", + "live_parallel_multiple": "possible_answer/BFCL_v3_live_parallel_multiple.json", + "sql": "possible_answer/BFCL_v3_sql.json", +} + + +@dataclass +class EvalResult: + """Result of a single evaluation.""" + + task_id: str + category: str + passed: bool + model_output: str + expected: Any + error: Optional[str] = None + + +def download_bfcl_file(filename: str) -> str: + """Download a BFCL file from HuggingFace Hub.""" + try: + local_path = hf_hub_download( + repo_id=BFCL_REPO, + filename=filename, + repo_type="dataset", + ) + return local_path + except Exception as e: + print(f"Error downloading {filename}: {e}") + return None + + +def load_jsonl_or_json(filepath: str) -> List[Dict[str, Any]]: + """Load data from JSON or JSONL file.""" + data = [] + with open(filepath, "r", 
encoding="utf-8") as f: + content = f.read().strip() + # Try as JSON array first + try: + data = json.loads(content) + if isinstance(data, dict): + data = [data] + except json.JSONDecodeError: + # Try as JSONL + f.seek(0) + for line in f: + line = line.strip() + if line: + try: + data.append(json.loads(line)) + except json.JSONDecodeError: + continue + return data + + +def load_bfcl_data(category: str, limit: Optional[int] = None) -> List[Dict[str, Any]]: + """Load BFCL dataset for a specific category.""" + filename = TEST_CATEGORIES.get(category) + if not filename: + print(f"Unknown category: {category}") + return [] + + print(f"Downloading {filename} from HuggingFace...") + filepath = download_bfcl_file(filename) + if not filepath: + return [] + + print(f"Loading data from {filepath}") + data = load_jsonl_or_json(filepath) + + # Also load ground truth answers if available + answer_file = ANSWER_FILES.get(category) + if answer_file: + print(f"Downloading answer file {answer_file}...") + answer_path = download_bfcl_file(answer_file) + if answer_path: + answers = load_jsonl_or_json(answer_path) + # Create a mapping from id to answer + answer_map = {} + for ans in answers: + ans_id = ans.get("id", "") + answer_map[ans_id] = ans.get("ground_truth", ans.get("result", [])) + + # Merge answers into data + for item in data: + item_id = item.get("id", "") + if item_id in answer_map: + item["ground_truth"] = answer_map[item_id] + + if limit: + data = data[:limit] + + print(f"Loaded {len(data)} examples for category: {category}") + return data + + +def fix_schema_types(schema: Any) -> Any: + """ + Fix Python type names to JSON Schema types. + BFCL uses Python type names like 'dict', 'list' but JSON Schema needs 'object', 'array'. + """ + if isinstance(schema, dict): + result = {} + for key, value in schema.items(): + if key == "type" and isinstance(value, str): + # Map Python types to JSON Schema types + type_mapping = { + "dict": "object", + "list": "array", + "str": "string", + "int": "integer", + "float": "number", + "bool": "boolean", + "NoneType": "null", + "tuple": "array", + } + result[key] = type_mapping.get(value, value) + else: + result[key] = fix_schema_types(value) + return result + elif isinstance(schema, list): + return [fix_schema_types(item) for item in schema] + else: + return schema + + +def convert_to_openai_tools(functions: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """Convert BFCL function format to OpenAI tools format.""" + tools = [] + for func in functions: + if isinstance(func, str): + func = json.loads(func) + + # Fix the parameters schema to use valid JSON Schema types + parameters = fix_schema_types(func.get("parameters", {})) + + tool = { + "type": "function", + "function": { + "name": func.get("name", ""), + "description": func.get("description", ""), + "parameters": parameters, + }, + } + tools.append(tool) + return tools + + +def parse_function_call(response: str) -> List[Dict[str, Any]]: + """Parse function calls from model response.""" + calls = [] + + # Try to parse as JSON array + try: + parsed = json.loads(response) + if isinstance(parsed, list): + return parsed + elif isinstance(parsed, dict): + return [parsed] + except json.JSONDecodeError: + pass + + # Try to find function call patterns + # Pattern 1: function_name(args) + func_pattern = r"(\w+)\s*\((.*?)\)" + matches = re.findall(func_pattern, response, re.DOTALL) + for name, args_str in matches: + try: + # Try to parse args as Python dict/kwargs + args_str = args_str.strip() + if args_str: + # Convert 
+
+
+def parse_function_call(response: str) -> List[Dict[str, Any]]:
+    """Parse function calls from model response."""
+    calls = []
+
+    # Try to parse as JSON array
+    try:
+        parsed = json.loads(response)
+        if isinstance(parsed, list):
+            return parsed
+        elif isinstance(parsed, dict):
+            return [parsed]
+    except json.JSONDecodeError:
+        pass
+
+    # Try to find function call patterns
+    # Pattern 1: function_name(args)
+    func_pattern = r"(\w+)\s*\((.*?)\)"
+    matches = re.findall(func_pattern, response, re.DOTALL)
+    for name, args_str in matches:
+        try:
+            # Try to parse args as Python dict/kwargs
+            args_str = args_str.strip()
+            if args_str:
+                # Convert kwargs-style arguments to a dict. This evaluates
+                # model-generated text, so hide the builtins from eval.
+                args = eval(f"dict({args_str})", {"__builtins__": {}, "dict": dict}, {})
+            else:
+                args = {}
+            calls.append({"name": name, "arguments": args})
+        except Exception:
+            pass
+
+    # Pattern 2: JSON-like tool_calls
+    tool_call_pattern = r'\{"name":\s*"([^"]+)",\s*"arguments":\s*(\{[^}]+\})\}'
+    matches = re.findall(tool_call_pattern, response)
+    for name, args_str in matches:
+        try:
+            args = json.loads(args_str)
+            calls.append({"name": name, "arguments": args})
+        except json.JSONDecodeError:
+            pass
+
+    return calls
+
+
+def extract_tool_calls_from_response(response) -> List[Dict[str, Any]]:
+    """Extract tool calls from OpenAI API response."""
+    calls = []
+
+    if hasattr(response, "choices") and response.choices:
+        choice = response.choices[0]
+        message = choice.message
+
+        # Check for tool_calls in response
+        if hasattr(message, "tool_calls") and message.tool_calls:
+            for tool_call in message.tool_calls:
+                func = tool_call.function
+                try:
+                    args = json.loads(func.arguments) if func.arguments else {}
+                except json.JSONDecodeError:
+                    args = {}
+                calls.append({"name": func.name, "arguments": args})
+
+        # Also check content for function calls (some models output in content)
+        if hasattr(message, "content") and message.content:
+            content_calls = parse_function_call(message.content)
+            if content_calls and not calls:
+                calls = content_calls
+
+    return calls
+
+
+def normalize_value(value: Any) -> Any:
+    """Normalize values for comparison."""
+    if isinstance(value, str):
+        # Try to parse as number
+        try:
+            return float(value)
+        except ValueError:
+            return value.lower().strip()
+    elif isinstance(value, bool):
+        return value
+    elif isinstance(value, (int, float)):
+        return float(value)
+    elif isinstance(value, list):
+        return [normalize_value(v) for v in value]
+    elif isinstance(value, dict):
+        return {k: normalize_value(v) for k, v in value.items()}
+    return value
+
+
+def value_matches_expected(predicted_value: Any, expected_values: Any) -> bool:
+    """
+    Check if predicted value matches expected value(s).
+    BFCL format: expected values can be a list of acceptable values.
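+
+    Illustrative examples (they follow the normalization rules above; not taken
+    from real BFCL data):
+        value_matches_expected("Paris", ["Paris", "paris"]) -> True   (case-insensitive)
+        value_matches_expected("5", [5, "five"]) -> True   (numeric strings compare as floats)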
+    """
+    # Normalize predicted value
+    pred_normalized = normalize_value(predicted_value)
+
+    # If expected is a list, check if predicted matches any item
+    if isinstance(expected_values, list):
+        for exp_val in expected_values:
+            exp_normalized = normalize_value(exp_val)
+            if pred_normalized == exp_normalized:
+                return True
+            # Also try string comparison for edge cases
+            if str(pred_normalized) == str(exp_normalized):
+                return True
+        return False
+    else:
+        exp_normalized = normalize_value(expected_values)
+        return pred_normalized == exp_normalized or str(pred_normalized) == str(exp_normalized)
+
+
+def compare_function_calls(
+    predicted: List[Dict[str, Any]], expected: List[Dict[str, Any]], strict: bool = False
+) -> Tuple[bool, str]:
+    """Compare predicted function calls with expected ones."""
+    if not predicted and not expected:
+        return True, ""
+
+    if len(predicted) != len(expected):
+        return False, f"Count mismatch: predicted {len(predicted)}, expected {len(expected)}"
+
+    # Sort by function name for comparison
+    pred_sorted = sorted(predicted, key=lambda x: x.get("name", ""))
+    exp_sorted = sorted(expected, key=lambda x: x.get("name", ""))
+
+    for pred, exp in zip(pred_sorted, exp_sorted):
+        pred_name = pred.get("name", "")
+        exp_name = exp.get("name", "")
+
+        if pred_name != exp_name:
+            return False, f"Function name mismatch: {pred_name} vs {exp_name}"
+
+        pred_args = pred.get("arguments", {})
+        exp_args = exp.get("arguments", {})
+
+        # Check required arguments match (BFCL format: values are lists of acceptable values)
+        for key, expected_values in exp_args.items():
+            if key not in pred_args:
+                return False, f"Missing argument {key} in {pred_name}"
+            if not value_matches_expected(pred_args[key], expected_values):
+                return False, f"Argument {key} mismatch in {pred_name}"
+
+    return True, ""
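+
+
+# Illustrative comparison (not executed; the function and argument names are hypothetical):
+#     compare_function_calls(
+#         [{"name": "get_weather", "arguments": {"city": "Paris"}}],
+#         [{"name": "get_weather", "arguments": {"city": ["Paris", "paris"]}}],
+#     )
+# returns (True, ""), while a missing or mismatched argument returns (False, "<reason>").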
+
+
+def parse_expected_output(ground_truth: Any) -> List[Dict[str, Any]]:
+    """
+    Parse expected output from BFCL ground truth.
+
+    BFCL format: [{"func_name": {"arg1": [val1, val2], "arg2": [val3]}}]
+    Convert to:  [{"name": "func_name", "arguments": {"arg1": [val1, val2], "arg2": [val3]}}]
+    """
+    if isinstance(ground_truth, str):
+        try:
+            ground_truth = json.loads(ground_truth)
+        except json.JSONDecodeError:
+            # Try parsing as Python literal
+            try:
+                ground_truth = ast.literal_eval(ground_truth)
+            except (ValueError, SyntaxError):
+                return []
+
+    if not ground_truth:
+        return []
+
+    # Ensure it's a list
+    if isinstance(ground_truth, dict):
+        ground_truth = [ground_truth]
+
+    result = []
+    for item in ground_truth:
+        if isinstance(item, dict):
+            # Check if it's already in standard format {"name": ..., "arguments": ...}
+            if "name" in item and "arguments" in item:
+                result.append(item)
+            else:
+                # BFCL format: {"func_name": {"arg1": [v1], "arg2": [v2]}}
+                for func_name, args in item.items():
+                    if isinstance(args, dict):
+                        result.append({"name": func_name, "arguments": args})
+                    else:
+                        # Handle edge case where args might not be a dict
+                        result.append({"name": func_name, "arguments": {}})
+
+    return result
+
+
+class BFCLEvaluator:
+    """BFCL Benchmark Evaluator using OpenAI-compatible API."""
+
+    def __init__(
+        self,
+        base_url: str,
+        model_name: str,
+        api_key: str = "EMPTY",
+        max_tokens: int = 1024,
+        temperature: float = 0.0,
+    ):
+        self.client = OpenAI(base_url=base_url, api_key=api_key)
+        self.model_name = model_name
+        self.max_tokens = max_tokens
+        self.temperature = temperature
+
+    def generate_response(
+        self, prompt: str, tools: List[Dict[str, Any]], system_prompt: Optional[str] = None
+    ) -> Tuple[Any, List[Dict[str, Any]]]:
+        """Generate response from the model with tool calling."""
+        messages = []
+
+        if system_prompt:
+            messages.append({"role": "system", "content": system_prompt})
+
+        messages.append({"role": "user", "content": prompt})
+
+        try:
+            response = self.client.chat.completions.create(
+                model=self.model_name,
+                messages=messages,
+                tools=tools if tools else None,
+                tool_choice="auto" if tools else None,
+                max_tokens=self.max_tokens,
+                temperature=self.temperature,
+            )
+            tool_calls = extract_tool_calls_from_response(response)
+            return response, tool_calls
+        except Exception as e:
+            print(f"API Error: {e}")
+            return None, []
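+
+    # Illustrative shape of generate_response's return value (not from a real run):
+    #     (response, [{"name": "get_weather", "arguments": {"city": "Paris"}}])
+    # i.e. the raw ChatCompletion object plus the tool calls parsed from it.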
+
+    def evaluate_single(self, item: Dict[str, Any], category: str) -> EvalResult:
+        """Evaluate a single BFCL example."""
+        task_id = item.get("id", "unknown")
+
+        # Extract question and functions
+        question = item.get("question", [[{"role": "user", "content": ""}]])
+        if isinstance(question, str):
+            prompt = question
+        elif isinstance(question, list) and question:
+            if isinstance(question[0], dict):
+                prompt = question[0].get("content", "")
+            elif isinstance(question[0], list) and question[0]:
+                prompt = question[0][0].get("content", "")
+            else:
+                prompt = str(question[0])
+        else:
+            prompt = str(question)
+
+        # Get functions
+        functions = item.get("function", [])
+        if isinstance(functions, str):
+            try:
+                functions = json.loads(functions)
+            except json.JSONDecodeError:
+                functions = []
+
+        if not isinstance(functions, list):
+            functions = [functions]
+
+        # Convert to OpenAI tools format
+        tools = convert_to_openai_tools(functions)
+
+        # Get expected output
+        ground_truth = item.get("ground_truth", item.get("answer", []))
+        expected = parse_expected_output(ground_truth)
+
+        # Generate response
+        system_prompt = (
+            "You are a helpful assistant that can use tools/functions to help answer questions. "
+            "When you need to call a function, use the provided tools."
+        )
+
+        response, predicted_calls = self.generate_response(prompt, tools, system_prompt)
+
+        if response is None:
+            return EvalResult(
+                task_id=task_id,
+                category=category,
+                passed=False,
+                model_output="",
+                expected=expected,
+                error="API call failed",
+            )
+
+        # For the irrelevance category, the model should NOT call any function
+        if "irrelevance" in category.lower():
+            passed = len(predicted_calls) == 0
+            error = "Model called a function when it shouldn't" if not passed else None
+        else:
+            # Compare function calls
+            passed, error = compare_function_calls(predicted_calls, expected)
+
+        model_output = json.dumps(predicted_calls, indent=2) if predicted_calls else str(response)
+
+        return EvalResult(
+            task_id=task_id, category=category, passed=passed, model_output=model_output, expected=expected, error=error
+        )
+
+    def evaluate_category(self, category: str, limit: Optional[int] = None, num_workers: int = 4) -> Dict[str, Any]:
+        """Evaluate all examples in a category."""
+        print(f"\nLoading BFCL dataset for category: {category}")
+        data = load_bfcl_data(category, limit)
+
+        if not data:
+            print(f"No data found for category: {category}")
+            return {"category": category, "total": 0, "passed": 0, "accuracy": 0.0}
+
+        print(f"Loaded {len(data)} examples")
+
+        results = []
+
+        # Use ThreadPoolExecutor for concurrent evaluation
+        with ThreadPoolExecutor(max_workers=num_workers) as executor:
+            futures = {executor.submit(self.evaluate_single, item, category): item for item in data}
+
+            for future in tqdm(as_completed(futures), total=len(futures), desc=f"Evaluating {category}"):
+                try:
+                    result = future.result()
+                    results.append(result)
+                except Exception as e:
+                    print(f"Error evaluating: {e}")
+
+        # Calculate metrics
+        total = len(results)
+        passed = sum(1 for r in results if r.passed)
+        accuracy = passed / total * 100 if total > 0 else 0.0
+
+        # Collect errors for analysis
+        errors = defaultdict(int)
+        for r in results:
+            if not r.passed and r.error:
+                errors[r.error[:50]] += 1
+
+        return {
+            "category": category,
+            "total": total,
+            "passed": passed,
+            "accuracy": accuracy,
+            "results": results,
+            "error_summary": dict(errors),
+        }
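+
+
+# Illustrative programmatic use (assumes an OpenAI-compatible LightLLM server is already
+# running locally; the URL, model name, and limit below are placeholders):
+#     evaluator = BFCLEvaluator(base_url="http://localhost:8000/v1", model_name="GLM-4.7-Flash")
+#     summary = evaluator.evaluate_category("simple", limit=10)
+#     print(summary["accuracy"])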
+
+
+def main():
+    parser = argparse.ArgumentParser(description="BFCL Evaluation for LightLLM")
+    parser.add_argument("--model_name", type=str, required=True, help="Model name")
+    parser.add_argument(
+        "--base_url", type=str, default="http://localhost:8000/v1", help="OpenAI-compatible API base URL"
+    )
+    parser.add_argument("--api_key", type=str, default="EMPTY", help="API key (use EMPTY for local)")
+    parser.add_argument(
+        "--test_category",
+        type=str,
+        default="simple",
+        choices=list(TEST_CATEGORIES.keys()) + ["all"],
+        help="Test category to evaluate",
+    )
+    parser.add_argument("--limit", type=int, default=None, help="Limit number of examples (for testing)")
+    parser.add_argument("--num_workers", type=int, default=4, help="Number of concurrent workers")
+    parser.add_argument("--max_tokens", type=int, default=1024, help="Maximum tokens to generate")
+    parser.add_argument("--temperature", type=float, default=0.0, help="Sampling temperature")
+    parser.add_argument("--output", "-o", type=str, default=None, help="Output file for detailed results")
+
+    args = parser.parse_args()
+
+    print("=" * 60)
+    print("BFCL (Berkeley Function Calling Leaderboard) Evaluation")
+    print("=" * 60)
+    print(f"Model: {args.model_name}")
+    print(f"API URL: {args.base_url}")
+    print(f"Test Category: {args.test_category}")
+    print()
+
+    evaluator = BFCLEvaluator(
+        base_url=args.base_url,
+        model_name=args.model_name,
+        api_key=args.api_key,
+        max_tokens=args.max_tokens,
+        temperature=args.temperature,
+    )
+
+    # Determine categories to evaluate
+    if args.test_category == "all":
+        categories = list(TEST_CATEGORIES.keys())
+    else:
+        categories = [args.test_category]
+
+    all_results = {}
+
+    for category in categories:
+        result = evaluator.evaluate_category(category, limit=args.limit, num_workers=args.num_workers)
+        all_results[category] = result
+
+        print(f"\n{category.upper()} Results:")
+        print(f"  Total: {result['total']}")
+        print(f"  Passed: {result['passed']}")
+        print(f"  Accuracy: {result['accuracy']:.2f}%")
+
+        if result.get("error_summary"):
+            print("  Common errors:")
+            for error, count in sorted(result["error_summary"].items(), key=lambda x: -x[1])[:5]:
+                print(f"    - {error}: {count}")
+
+    # Print summary
+    print("\n" + "=" * 60)
+    print("SUMMARY")
+    print("=" * 60)
+    print(f"{'Category':<25} {'Total':>8} {'Passed':>8} {'Accuracy':>10}")
+    print("-" * 60)
+
+    total_all = 0
+    passed_all = 0
+
+    for category, result in all_results.items():
+        print(f"{category:<25} {result['total']:>8} {result['passed']:>8} {result['accuracy']:>9.2f}%")
+        total_all += result["total"]
+        passed_all += result["passed"]
+
+    if len(all_results) > 1:
+        print("-" * 60)
+        overall_acc = passed_all / total_all * 100 if total_all > 0 else 0
+        print(f"{'OVERALL':<25} {total_all:>8} {passed_all:>8} {overall_acc:>9.2f}%")
+
+    print("=" * 60)
+
+    # Save detailed results
+    if args.output:
+        output_data = {
+            "model": args.model_name,
+            "config": {
+                "base_url": args.base_url,
+                "max_tokens": args.max_tokens,
+                "temperature": args.temperature,
+            },
+            "results": {
+                cat: {
+                    "total": r["total"],
+                    "passed": r["passed"],
+                    "accuracy": r["accuracy"],
+                    "error_summary": r.get("error_summary", {}),
+                }
+                for cat, r in all_results.items()
+            },
+        }
+        with open(args.output, "w") as f:
+            json.dump(output_data, f, indent=2)
+        print(f"\nResults saved to {args.output}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/test/eval/requirements.txt b/test/eval/requirements.txt
new file mode 100644
index 000000000..e57d2da41
--- /dev/null
+++ b/test/eval/requirements.txt
@@ -0,0 +1,13 @@
+# Evaluation benchmark dependencies
+aiohttp>=3.8.0
+tqdm>=4.64.0
+transformers>=4.30.0
+numpy>=1.21.0
+openai>=1.0.0
+huggingface_hub>=0.20.0
+
+# Optional: official human-eval package for dataset loading
+# pip install git+https://github.com/openai/human-eval.git
+
+# Optional: official BFCL evaluation package
+# pip install bfcl-eval
diff --git a/test/eval/run_bfcl.sh b/test/eval/run_bfcl.sh
new file mode 100755
index 000000000..2e68f8380
--- /dev/null
+++ b/test/eval/run_bfcl.sh
@@ -0,0 +1,48 @@
+#!/bin/bash
+# BFCL (Berkeley Function Calling Leaderboard) evaluation script for LightLLM
+#
+# Prerequisites:
+# 1. Start LightLLM server:
+#    python -m lightllm.server.api_server \
+#        --model_dir /path/to/GLM-4.7-Flash \
+#        --tp 1 \
+#        --port 8000
+#
+# 2. Install dependencies:
+#    pip install -r test/eval/requirements.txt
+
+set -e
+
+# Configuration
+MODEL_NAME="${MODEL_NAME:-GLM-4.7-Flash}"
+PORT="${PORT:-8000}"
+BASE_URL="${BASE_URL:-http://localhost:${PORT}/v1}"
+TEST_CATEGORY="${TEST_CATEGORY:-simple}"
+NUM_WORKERS="${NUM_WORKERS:-4}"
+
+# Check if server is running
curl -s "http://localhost:${PORT}/health" > /dev/null 2>&1; then + echo "Error: LightLLM server not running on port ${PORT}" + echo "Start the server first with:" + echo " python -m lightllm.server.api_server --model_dir /path/to/model --tp 1 --port ${PORT}" + exit 1 +fi + +echo "==========================================" +echo "BFCL Function Calling Evaluation" +echo "==========================================" +echo "Model: ${MODEL_NAME}" +echo "Server: ${BASE_URL}" +echo "Test Category: ${TEST_CATEGORY}" +echo "" + +# Run evaluation +python "$(dirname "$0")/eval_bfcl.py" \ + --model_name "${MODEL_NAME}" \ + --base_url "${BASE_URL}" \ + --test_category "${TEST_CATEGORY}" \ + --num_workers "${NUM_WORKERS}" \ + --output "bfcl_results_${TEST_CATEGORY}_$(date +%Y%m%d_%H%M%S).json" + +echo "" +echo "Evaluation complete!"