Commit ec96fda
initial commit

17 files changed: +653 -0 lines changed

.github/workflows/ci.yml

Lines changed: 22 additions & 0 deletions
@@ -0,0 +1,22 @@
name: CI

on: [push, pull_request]

jobs:
  test:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ["3.10", "3.11", "3.12"]
    steps:
      - uses: actions/checkout@v4
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install deps
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements-dev.txt
      - name: Run tests
        run: make test

.gitignore

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
__pycache__/
*.pyc
reports/

Makefile

Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
.PHONY: test run
.ONESHELL:  # run each recipe in a single shell so the heredoc below works
test:
	python -m pytest -q

run:
	python - <<'PY'
	import asyncio, sys
	sys.path.insert(0, '')
	from benches.harness import run_bench
	asyncio.run(run_bench('bench_config.yaml'))
	PY

README.md

Lines changed: 9 additions & 0 deletions
@@ -0,0 +1,9 @@
# LLM Benchmark Suite (Accuracy • Speed • Memory)

Lightweight harness to benchmark LLM providers locally. See `bench_config.yaml` for an example run and `datasets/` for tiny test datasets.

Run:

```bash
python scripts/run_bench.py -c bench_config.yaml
```
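
The dataset itself (`datasets/qa_tiny.jsonl`) is not shown in this commit view, so the rows below are illustrative only. The field names come from what `benches/harness.py` actually reads: `input` fills the prompt template, `id` and `target` feed the per-sample metrics, and optional `choices`/`answer` switch on multiple-choice accuracy.

```jsonl
{"id": "qa-001", "input": "What is the capital of France?", "target": "Paris"}
{"id": "qa-002", "input": "Which of these numbers is prime?", "target": "7", "choices": ["6", "7", "8"], "answer": "7"}
```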

bench_config.yaml

Lines changed: 22 additions & 0 deletions
@@ -0,0 +1,22 @@
run_name: demo_mock
provider:
  kind: mock
io:
  dataset_path: datasets/qa_tiny.jsonl
  output_prefix: reports/demo_mock
prompt:
  system: "You are a concise assistant."
  template: |
    Answer the question. If unknown, say "unknown".
    Q: {input}
    A:
limits:
  max_samples: 10
  timeout_s: 60
load:
  batch_size: 1
  concurrency: 2
metrics:
  bleu: false
  rougeL: false
  normalization: "lower_strip"
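
Only the `provider` block changes when pointing the same config at a real backend. The keys below mirror what `_load_provider` in `benches/harness.py` reads for each kind; the URL and model names are placeholder values, not part of this commit.

```yaml
# Local Ollama server (illustrative values); an optional `options:` mapping
# is forwarded verbatim to provider.generate().
provider:
  kind: ollama
  base_url: http://localhost:11434
  model: llama3

# OpenAI-compatible endpoint (illustrative values):
# provider:
#   kind: openai
#   api_base: https://api.example.com/v1
#   model: my-model
#   api_key: replace-with-your-key
```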

benches/__init__.py

Lines changed: 9 additions & 0 deletions
@@ -0,0 +1,9 @@
"""llm-bench benches package"""

__all__ = [
    "providers",
    "metrics",
    "monitor",
    "util",
    "harness",
]

benches/harness.py

Lines changed: 203 additions & 0 deletions
@@ -0,0 +1,203 @@
from __future__ import annotations
import asyncio, json, os, signal
from dataclasses import dataclass
from typing import Any, Dict, List, Optional
from .providers import OllamaProvider, OpenAIStyleProvider
from .metrics import exact_match, token_f1, bleu, rouge_l, mc_accuracy
from .monitor import ResourceMonitor
from .util import stopwatch, write_jsonl

# optional heavy deps are imported lazily inside run_bench


@dataclass
class BenchConfig:
    run_name: str
    provider: Dict[str, Any]
    io: Dict[str, Any]
    prompt: Dict[str, Any]
    limits: Dict[str, Any]
    load: Dict[str, Any]
    metrics: Dict[str, Any]


async def _load_provider(cfg: Dict[str, Any]):
    kind = cfg.get('kind')
    if kind == 'ollama':
        return OllamaProvider(cfg['base_url'], cfg['model'])
    if kind == 'openai':
        return OpenAIStyleProvider(cfg['api_base'], cfg['model'], cfg.get('api_key'))
    if kind == 'mock':
        from .providers import MockProvider
        return MockProvider()
    raise ValueError(f"Unknown provider kind: {kind}")


async def _worker(name: str, queue: asyncio.Queue, provider, sysmsg, tmpl, options, results, norm):
    while True:
        item = await queue.get()
        if item is None:
            queue.task_done()
            break
        rec = item
        prompt = tmpl.format(**rec)
        with stopwatch() as sw:
            out = await provider.generate(prompt, system=sysmsg, options=options)
        latency_s = out.get('latency_s') or sw()
        text = out['output'].strip()
        row = {
            'id': rec.get('id'),
            'latency_s': latency_s,
            'output': text,
            'target': rec.get('target'),
            'prompt_eval_count': out.get('prompt_eval_count'),
            'eval_count': out.get('eval_count'),
        }
        # accuracy
        if 'choices' in rec and rec.get('answer') is not None:
            row['acc'] = mc_accuracy(text, rec['answer'], rec['choices'], norm)
        if rec.get('target') is not None:
            row['em'] = exact_match(text, rec['target'], norm)
            row['f1'] = token_f1(text, rec['target'], norm)
            b = bleu(text, rec['target'])
            if b is not None:
                row['bleu'] = b
            r = rouge_l(text, rec['target'])
            if r is not None:
                row['rougeL'] = r
        results.append(row)
        queue.task_done()


async def run_bench(config_path: str):
    import yaml
    cfg = BenchConfig(**yaml.safe_load(open(config_path)))
    provider = await _load_provider(cfg.provider)

    try:
        import orjson
    except Exception:
        orjson = None

    # load jsonl as text, skip empty lines
    rows = []
    if orjson:
        with open(cfg.io['dataset_path'], 'r', encoding='utf8') as _f:
            for line in _f:
                line = line.strip()
                if not line:
                    continue
                rows.append(orjson.loads(line))
    else:
        import json as _json
        with open(cfg.io['dataset_path'], 'r', encoding='utf8') as _f:
            for line in _f:
                line = line.strip()
                if not line:
                    continue
                rows.append(_json.loads(line))
    if cfg.limits.get('max_samples'):
        rows = rows[: int(cfg.limits['max_samples'])]

    q = asyncio.Queue()
    for r in rows:
        await q.put(r)
    for _ in range(cfg.load.get('concurrency', 1)):
        await q.put(None)

    results: List[Dict[str, Any]] = []

    mon = ResourceMonitor(os.getpid(), interval_s=0.2)
    mon.start()

    workers = [
        asyncio.create_task(
            _worker(f"w{i}", q, provider, cfg.prompt.get('system'), cfg.prompt.get('template', '{input}'), cfg.provider.get('options'), results, cfg.metrics.get('normalization'))
        )
        for i in range(cfg.load.get('concurrency', 1))
    ]
    await asyncio.gather(*workers)
    mon.stop()

    # write per-sample
    out_prefix = cfg.io['output_prefix']
    write_jsonl(f"{out_prefix}.jsonl", results)

    try:
        import pandas as pd
        df = pd.DataFrame(results)
    except Exception:
        pd = None
        df = None
    agg = {
        'latency_s': ['mean', 'p50', 'p95', 'min', 'max'],
        'em': 'mean',
        'f1': 'mean',
        'acc': 'mean',
        'bleu': 'mean',
        'rougeL': 'mean',
        'eval_count': 'mean',
    }
    # Custom percentiles
    # safe summary computation if pandas isn't available
    if df is not None:
        qtiles = df['latency_s'].quantile([0.5, 0.95]).to_dict() if 'latency_s' in df else {}
        n_samples = len(df)
        latency_mean = float(df['latency_s'].mean()) if 'latency_s' in df else None
        latency_p50 = float(qtiles.get(0.5)) if qtiles else None
        latency_p95 = float(qtiles.get(0.95)) if qtiles else None
        throughput = len(df) / df['latency_s'].sum() if 'latency_s' in df else None
        em_mean = float(df['em'].mean()) if 'em' in df else None
        f1_mean = float(df['f1'].mean()) if 'f1' in df else None
        acc_mean = float(df['acc'].mean()) if 'acc' in df else None
        bleu_mean = float(df['bleu'].mean()) if 'bleu' in df else None
        rouge_mean = float(df['rougeL'].mean()) if 'rougeL' in df else None
        tokens_mean = float(df['eval_count'].mean()) if 'eval_count' in df else None
    else:
        n_samples = len(results)
        latency_mean = latency_p50 = latency_p95 = throughput = None
        em_mean = f1_mean = acc_mean = bleu_mean = rouge_mean = tokens_mean = None

    summ = {
        'n_samples': n_samples,
        'latency_mean_s': latency_mean,
        'latency_p50_s': latency_p50,
        'latency_p95_s': latency_p95,
        'throughput_req_per_s': throughput,
        'em_mean': em_mean,
        'f1_mean': f1_mean,
        'acc_mean': acc_mean,
        'bleu_mean': bleu_mean,
        'rougeL_mean': rouge_mean,
        'tokens_out_mean': tokens_mean,
    }
    try:
        if pd is not None:
            pd.DataFrame([summ]).to_csv(f"{out_prefix}_summary.csv", index=False)
        else:
            # fallback: write minimal csv using stdlib
            import csv
            with open(f"{out_prefix}_summary.csv", 'w', newline='', encoding='utf8') as _f:
                w = csv.DictWriter(_f, fieldnames=list(summ.keys()))
                w.writeheader()
                w.writerow(summ)
    except Exception:
        pass

    # memory timeline: try pandas, else write CSV via stdlib
    try:
        import pandas as pd
        mem_df = pd.DataFrame([s.__dict__ for s in mon.samples])
        mem_df.to_csv(f"{out_prefix}_resources.csv", index=False)
    except Exception:
        import csv
        fields = ['t_s', 'cpu_pct', 'rss_mb', 'gpu_util_pct', 'vram_mb']
        with open(f"{out_prefix}_resources.csv", 'w', newline='', encoding='utf8') as _f:
            w = csv.DictWriter(_f, fieldnames=fields)
            w.writeheader()
            for s in mon.samples:
                w.writerow(s.__dict__)

    # markdown report
    with open(f"{out_prefix}_report.md", 'w') as f:
        f.write("# LLM Bench Report\n\n")
        for k, v in summ.items():
            f.write(f"- **{k}**: {v}\n")
        f.write("\n## Notes\n- Latency is end-to-end per request.\n- Throughput is requests/sec at measured concurrency.\n- See resources.csv for CPU/GPU timeline.\n")
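
The concrete providers (`OllamaProvider`, `OpenAIStyleProvider`, `MockProvider`) live in `benches/providers.py`, which is not shown above, but `_worker` pins down the interface: `generate()` is awaited with the rendered prompt plus `system` and `options` keyword arguments and must return a dict containing an `output` string, with `latency_s`, `prompt_eval_count`, and `eval_count` optional. A minimal stand-in satisfying that contract could look like the sketch below (the class name and behaviour are assumptions, not code from this commit):

```python
import asyncio


class EchoProvider:
    """Toy provider matching the interface _worker expects; not part of this commit."""

    async def generate(self, prompt: str, system: str | None = None, options: dict | None = None) -> dict:
        await asyncio.sleep(0)  # stand-in for a real network/API call
        return {
            "output": f"echo: {prompt[:40]}",  # required: the generated text
            "latency_s": None,                 # optional; the harness falls back to its own stopwatch
            "prompt_eval_count": None,         # optional token accounting
            "eval_count": None,
        }
```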

benches/metrics.py

Lines changed: 61 additions & 0 deletions
@@ -0,0 +1,61 @@
from __future__ import annotations
import re
from typing import List, Dict, Optional

try:
    import sacrebleu  # type: ignore
except Exception:
    sacrebleu = None
try:
    from rouge_score import rouge_scorer
except Exception:
    rouge_scorer = None

NORMALIZERS = {
    None: lambda s: s,
    "lower_strip": lambda s: s.lower().strip(),
}

def normalize(s: str, kind: Optional[str]) -> str:
    fn = NORMALIZERS.get(kind)
    return fn(s) if fn else s

def exact_match(pred: str, gold: str, norm: Optional[str]) -> float:
    return 1.0 if normalize(pred, norm) == normalize(gold, norm) else 0.0

def token_f1(pred: str, gold: str, norm: Optional[str]) -> float:
    p = normalize(pred, norm).split()
    g = normalize(gold, norm).split()
    common = {}
    for tok in p:
        if tok in g:
            common[tok] = min(p.count(tok), g.count(tok))
    num_same = sum(common.values())
    if len(p) == 0 or len(g) == 0:
        return float(p == g)
    if num_same == 0:
        return 0.0
    precision = num_same / len(p)
    recall = num_same / len(g)
    return 2 * precision * recall / (precision + recall)

def bleu(pred: str, gold: str) -> Optional[float]:
    if not sacrebleu:
        return None
    return sacrebleu.corpus_bleu([pred], [[gold]]).score

def rouge_l(pred: str, gold: str) -> Optional[float]:
    if not rouge_scorer:
        return None
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    return scorer.score(gold, pred)['rougeL'].fmeasure

def mc_accuracy(pred: str, answer: str, choices: List[str], norm: Optional[str]) -> float:
    p = normalize(pred, norm)
    candidates = [normalize(c, norm) for c in choices]
    # greedy: pick exact first, else contains
    if p in candidates:
        return 1.0 if p == normalize(answer, norm) else 0.0
    # fuzzy contains: prefer exact, then check if candidate is substring of pred or pred is substring of candidate
    best = max(candidates, key=lambda c: ((c in p) or (p in c), -abs(len(c) - len(p))))
    return 1.0 if best == normalize(answer, norm) else 0.0
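
A quick sanity check of the helpers above; the strings are made up, but the values in the comments follow directly from the definitions:

```python
from benches.metrics import exact_match, token_f1, mc_accuracy

print(exact_match("  Paris ", "paris", "lower_strip"))   # 1.0 after lower_strip normalization
print(token_f1("paris france", "paris", "lower_strip"))  # precision 0.5, recall 1.0 -> F1 ~ 0.67
print(mc_accuracy("the answer is 7", "7", ["6", "7", "8"], "lower_strip"))  # "7" is a substring of the prediction -> 1.0
```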
