
Commit 32e4ee8 ("cleanup")
Parent: 54e2f49

File tree: 4 files changed (+56 lines, -25 lines)


benches/harness.py

Lines changed: 14 additions & 11 deletions
@@ -147,19 +147,22 @@ async def run_bench(config_path: str):
     }
     # Custom percentiles
     # safe summary computation if pandas isn't available
-    if df is not None:
+    if df is not None and len(df) > 0:
         qtiles = df['latency_s'].quantile([0.5, 0.95]).to_dict() if 'latency_s' in df else {}
-        n_samples = len(df)
+        n_samples = int(len(df))
         latency_mean = float(df['latency_s'].mean()) if 'latency_s' in df else None
-        latency_p50 = float(qtiles.get(0.5)) if qtiles else None
-        latency_p95 = float(qtiles.get(0.95)) if qtiles else None
-        throughput = len(df) / df['latency_s'].sum() if 'latency_s' in df else None
-        em_mean = float(df['em'].mean()) if 'em' in df else None
-        f1_mean = float(df['f1'].mean()) if 'f1' in df else None
-        acc_mean = float(df['acc'].mean()) if 'acc' in df else None
-        bleu_mean = float(df['bleu'].mean()) if 'bleu' in df else None
-        rouge_mean = float(df['rougeL'].mean()) if 'rougeL' in df else None
-        tokens_mean = float(df['eval_count'].mean()) if 'eval_count' in df else None
+        latency_p50 = float(qtiles.get(0.5)) if qtiles and qtiles.get(0.5) is not None else None
+        latency_p95 = float(qtiles.get(0.95)) if qtiles and qtiles.get(0.95) is not None else None
+        try:
+            throughput = len(df) / float(df['latency_s'].sum()) if 'latency_s' in df and df['latency_s'].sum() else None
+        except Exception:
+            throughput = None
+        em_mean = float(df['em'].mean()) if 'em' in df and df['em'].count() > 0 else None
+        f1_mean = float(df['f1'].mean()) if 'f1' in df and df['f1'].count() > 0 else None
+        acc_mean = float(df['acc'].mean()) if 'acc' in df and df['acc'].count() > 0 else None
+        bleu_mean = float(df['bleu'].mean()) if 'bleu' in df and df['bleu'].count() > 0 else None
+        rouge_mean = float(df['rougeL'].mean()) if 'rougeL' in df and df['rougeL'].count() > 0 else None
+        tokens_mean = float(df['eval_count'].mean()) if 'eval_count' in df and df['eval_count'].count() > 0 else None
     else:
         n_samples = len(results)
         latency_mean = latency_p50 = latency_p95 = throughput = None
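
As an illustration of the guard pattern introduced here, the following sketch computes the same kind of summary against a DataFrame that is missing the metric columns (this example assumes pandas is installed; the column names simply mirror those used in the diff):

```python
import pandas as pd

# Results frame with latencies but none of the metric columns (em, f1, ...).
df = pd.DataFrame({"latency_s": [0.42, 0.55, 0.61]})

# Same guard style as the patched harness: check that the column exists and
# has data before computing, otherwise fall back to None.
latency_mean = float(df["latency_s"].mean()) if "latency_s" in df else None
em_mean = float(df["em"].mean()) if "em" in df and df["em"].count() > 0 else None

qtiles = df["latency_s"].quantile([0.5, 0.95]).to_dict() if "latency_s" in df else {}
latency_p50 = float(qtiles.get(0.5)) if qtiles and qtiles.get(0.5) is not None else None

print(latency_mean, em_mean, latency_p50)  # 0.526..., None, 0.55
```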

benches/metrics.py

Lines changed: 18 additions & 5 deletions
@@ -16,9 +16,17 @@
     "lower_strip": lambda s: s.lower().strip(),
 }
 
-def normalize(s: str, kind: Optional[str]) -> str:
+
+def normalize(s: Optional[str], kind: Optional[str]) -> str:
+    """Normalize a string safely. Treat None as empty string."""
+    if s is None:
+        s = ""
     fn = NORMALIZERS.get(kind)
-    return fn(s) if fn else s
+    try:
+        return fn(s) if fn else s
+    except Exception:
+        # if normalization fails, return the original string
+        return s
 
 def exact_match(pred: str, gold: str, norm: Optional[str]) -> float:
     return 1.0 if normalize(pred, norm) == normalize(gold, norm) else 0.0
@@ -50,12 +58,17 @@ def rouge_l(pred: str, gold: str) -> Optional[float]:
     scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
     return scorer.score(gold, pred)['rougeL'].fmeasure
 
-def mc_accuracy(pred: str, answer: str, choices: List[str], norm: Optional[str]) -> float:
+def mc_accuracy(pred: Optional[str], answer: Optional[str], choices: List[str], norm: Optional[str]) -> float:
     p = normalize(pred, norm)
+    if not choices:
+        return 0.0
     candidates = [normalize(c, norm) for c in choices]
     # greedy: pick exact first, else contains
     if p in candidates:
         return 1.0 if p == normalize(answer, norm) else 0.0
-    # fuzzy contains: prefer exact, then check if candidate is substring of pred or pred is substring of candidate
-    best = max(candidates, key=lambda c: ((c in p) or (p in c), -abs(len(c)-len(p))))
+    # fuzzy contains: prefer candidate contained in pred or vice versa, then length closeness
+    try:
+        best = max(candidates, key=lambda c: ((c in p) or (p in c), -abs(len(c) - len(p))))
+    except Exception:
+        best = candidates[0]
     return 1.0 if best == normalize(answer, norm) else 0.0
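
A few spot checks of how the patched helpers behave (the `from benches.metrics import ...` path is an assumption based on the file location):

```python
# Assumed import path; adjust if the package layout differs.
from benches.metrics import normalize, exact_match, mc_accuracy

print(normalize(None, "lower_strip"))                    # "" -> None is treated as an empty string
print(exact_match("  Paris ", "paris", "lower_strip"))   # 1.0
print(mc_accuracy("i think the answer is paris", "paris",
                  ["london", "paris", "rome"], "lower_strip"))  # 1.0 via the substring fallback
print(mc_accuracy("paris", "paris", [], "lower_strip"))  # 0.0 -> empty choice list short-circuits
```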

benches/monitor.py

Lines changed: 12 additions & 9 deletions
@@ -14,9 +14,9 @@
 _NV_OK = False
 pynvml = None
 if not (os.environ.get('LLM_BENCH_SKIP_GPU') == '1'):
-    # prefer newer package name if available
+    # try to initialize NVML if available; prefer nvidia-ml-py, fall back to pynvml
     try:
-        import nvidia_ml_py as nvml_pkg  # type: ignore
+        import nvidia_ml_py as nvml_pkg  # preferred package
         try:
             nvml_pkg.nvmlInit()
             pynvml = nvml_pkg
@@ -25,7 +25,6 @@
             _NV_OK = False
     except Exception:
         try:
-            # suppress deprecation FutureWarning from pynvml package if present
             import warnings
             with warnings.catch_warnings():
                 warnings.simplefilter('ignore', FutureWarning)
@@ -75,7 +74,7 @@ def _run(self):
         except Exception:
             p = None
         nv_dev = None
-        if _NV_OK:
+        if _NV_OK and pynvml:
             try:
                 nv_dev = pynvml.nvmlDeviceGetHandleByIndex(self.gpu_index)
             except Exception:
@@ -99,10 +98,14 @@ def _run(self):
             cpu = 0.0
             mem = 0.0
             gpu_util = vram = None
-            if nv_dev:
-                util = pynvml.nvmlDeviceGetUtilizationRates(nv_dev)
-                memi = pynvml.nvmlDeviceGetMemoryInfo(nv_dev)
-                gpu_util = float(util.gpu)
-                vram = float(memi.used) / (1024**2)
+            if nv_dev and pynvml and _NV_OK:
+                try:
+                    util = pynvml.nvmlDeviceGetUtilizationRates(nv_dev)
+                    memi = pynvml.nvmlDeviceGetMemoryInfo(nv_dev)
+                    gpu_util = float(getattr(util, 'gpu', 0.0))
+                    vram = float(getattr(memi, 'used', 0)) / (1024**2)
+                except Exception:
+                    gpu_util = None
+                    vram = None
             self.samples.append(Sample(time.time(), cpu, mem, gpu_util, vram))
             time.sleep(self.interval_s)
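
Outside the monitor thread, the same defensive NVML read can be sketched standalone (an illustration, not part of this commit; it assumes the pynvml module from the nvidia-ml-py package may or may not be importable):

```python
# Read GPU utilization and VRAM once, degrading to None when NVML is unavailable.
gpu_util = vram_mib = None
try:
    import pynvml

    pynvml.nvmlInit()
    dev = pynvml.nvmlDeviceGetHandleByIndex(0)
    util = pynvml.nvmlDeviceGetUtilizationRates(dev)
    memi = pynvml.nvmlDeviceGetMemoryInfo(dev)
    gpu_util = float(getattr(util, "gpu", 0.0))
    vram_mib = float(getattr(memi, "used", 0)) / (1024 ** 2)
    pynvml.nvmlShutdown()
except Exception:
    # No NVIDIA driver or NVML bindings: leave the GPU fields as None.
    pass

print(gpu_util, vram_mib)
```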

pyproject.toml

Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
+[project]
+name = "llm-bench"
+version = "0.0.0"
+description = "Lightweight LLM benchmark harness"
+requires-python = ">=3.10"
+
+[project.optional-dependencies]
+gpu = ["nvidia-ml-py"]
+
+[build-system]
+requires = ["setuptools>=61.0"]
+build-backend = "setuptools.build_meta"
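
With this metadata in place the NVML bindings stay optional: an install such as `pip install -e ".[gpu]"` would pull in nvidia-ml-py, while a plain install leaves the GPU probe in benches/monitor.py to fall back gracefully.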
