
Commit 32e4ee8 ("cleanup")
Parent: 54e2f49

File tree: 4 files changed (+56 lines, -25 lines)


benches/harness.py

Lines changed: 14 additions & 11 deletions
@@ -147,19 +147,22 @@ async def run_bench(config_path: str):
     }
     # Custom percentiles
     # safe summary computation if pandas isn't available
-    if df is not None:
+    if df is not None and len(df) > 0:
         qtiles = df['latency_s'].quantile([0.5, 0.95]).to_dict() if 'latency_s' in df else {}
-        n_samples = len(df)
+        n_samples = int(len(df))
         latency_mean = float(df['latency_s'].mean()) if 'latency_s' in df else None
-        latency_p50 = float(qtiles.get(0.5)) if qtiles else None
-        latency_p95 = float(qtiles.get(0.95)) if qtiles else None
-        throughput = len(df) / df['latency_s'].sum() if 'latency_s' in df else None
-        em_mean = float(df['em'].mean()) if 'em' in df else None
-        f1_mean = float(df['f1'].mean()) if 'f1' in df else None
-        acc_mean = float(df['acc'].mean()) if 'acc' in df else None
-        bleu_mean = float(df['bleu'].mean()) if 'bleu' in df else None
-        rouge_mean = float(df['rougeL'].mean()) if 'rougeL' in df else None
-        tokens_mean = float(df['eval_count'].mean()) if 'eval_count' in df else None
+        latency_p50 = float(qtiles.get(0.5)) if qtiles and qtiles.get(0.5) is not None else None
+        latency_p95 = float(qtiles.get(0.95)) if qtiles and qtiles.get(0.95) is not None else None
+        try:
+            throughput = len(df) / float(df['latency_s'].sum()) if 'latency_s' in df and df['latency_s'].sum() else None
+        except Exception:
+            throughput = None
+        em_mean = float(df['em'].mean()) if 'em' in df and df['em'].count() > 0 else None
+        f1_mean = float(df['f1'].mean()) if 'f1' in df and df['f1'].count() > 0 else None
+        acc_mean = float(df['acc'].mean()) if 'acc' in df and df['acc'].count() > 0 else None
+        bleu_mean = float(df['bleu'].mean()) if 'bleu' in df and df['bleu'].count() > 0 else None
+        rouge_mean = float(df['rougeL'].mean()) if 'rougeL' in df and df['rougeL'].count() > 0 else None
+        tokens_mean = float(df['eval_count'].mean()) if 'eval_count' in df and df['eval_count'].count() > 0 else None
     else:
         n_samples = len(results)
         latency_mean = latency_p50 = latency_p95 = throughput = None
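
As an illustration of the guard pattern introduced here, the following sketch computes the same kind of summary against a DataFrame that is missing the metric columns (this example assumes pandas is installed; the column names simply mirror those used in the diff):

```python
import pandas as pd

# Results frame with latencies but none of the metric columns (em, f1, ...).
df = pd.DataFrame({"latency_s": [0.42, 0.55, 0.61]})

# Same guard style as the patched harness: check that the column exists and
# has data before computing, otherwise fall back to None.
latency_mean = float(df["latency_s"].mean()) if "latency_s" in df else None
em_mean = float(df["em"].mean()) if "em" in df and df["em"].count() > 0 else None

qtiles = df["latency_s"].quantile([0.5, 0.95]).to_dict() if "latency_s" in df else {}
latency_p50 = float(qtiles.get(0.5)) if qtiles and qtiles.get(0.5) is not None else None

print(latency_mean, em_mean, latency_p50)  # 0.526..., None, 0.55
```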

benches/metrics.py

Lines changed: 18 additions & 5 deletions
@@ -16,9 +16,17 @@
     "lower_strip": lambda s: s.lower().strip(),
 }
 
-def normalize(s: str, kind: Optional[str]) -> str:
+
+def normalize(s: Optional[str], kind: Optional[str]) -> str:
+    """Normalize a string safely. Treat None as empty string."""
+    if s is None:
+        s = ""
     fn = NORMALIZERS.get(kind)
-    return fn(s) if fn else s
+    try:
+        return fn(s) if fn else s
+    except Exception:
+        # if normalization fails, return the original string
+        return s
 
 def exact_match(pred: str, gold: str, norm: Optional[str]) -> float:
     return 1.0 if normalize(pred, norm) == normalize(gold, norm) else 0.0
@@ -50,12 +58,17 @@ def rouge_l(pred: str, gold: str) -> Optional[float]:
     scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
     return scorer.score(gold, pred)['rougeL'].fmeasure
 
-def mc_accuracy(pred: str, answer: str, choices: List[str], norm: Optional[str]) -> float:
+def mc_accuracy(pred: Optional[str], answer: Optional[str], choices: List[str], norm: Optional[str]) -> float:
     p = normalize(pred, norm)
+    if not choices:
+        return 0.0
     candidates = [normalize(c, norm) for c in choices]
     # greedy: pick exact first, else contains
     if p in candidates:
         return 1.0 if p == normalize(answer, norm) else 0.0
-    # fuzzy contains: prefer exact, then check if candidate is substring of pred or pred is substring of candidate
-    best = max(candidates, key=lambda c: ((c in p) or (p in c), -abs(len(c)-len(p))))
+    # fuzzy contains: prefer candidate contained in pred or vice versa, then length closeness
+    try:
+        best = max(candidates, key=lambda c: ((c in p) or (p in c), -abs(len(c) - len(p))))
+    except Exception:
+        best = candidates[0]
     return 1.0 if best == normalize(answer, norm) else 0.0
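
A few spot checks of how the patched helpers behave (the `from benches.metrics import ...` path is an assumption based on the file location):

```python
# Assumed import path; adjust if the package layout differs.
from benches.metrics import normalize, exact_match, mc_accuracy

print(normalize(None, "lower_strip"))                    # "" -> None is treated as an empty string
print(exact_match("  Paris ", "paris", "lower_strip"))   # 1.0
print(mc_accuracy("i think the answer is paris", "paris",
                  ["london", "paris", "rome"], "lower_strip"))  # 1.0 via the substring fallback
print(mc_accuracy("paris", "paris", [], "lower_strip"))  # 0.0 -> empty choice list short-circuits
```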

benches/monitor.py

Lines changed: 12 additions & 9 deletions
@@ -14,9 +14,9 @@
 _NV_OK = False
 pynvml = None
 if not (os.environ.get('LLM_BENCH_SKIP_GPU') == '1'):
-    # prefer newer package name if available
+    # try to initialize NVML if available; prefer nvidia-ml-py, fall back to pynvml
     try:
-        import nvidia_ml_py as nvml_pkg  # type: ignore
+        import nvidia_ml_py as nvml_pkg  # preferred package
         try:
             nvml_pkg.nvmlInit()
             pynvml = nvml_pkg
@@ -25,7 +25,6 @@
             _NV_OK = False
     except Exception:
         try:
-            # suppress deprecation FutureWarning from pynvml package if present
             import warnings
             with warnings.catch_warnings():
                 warnings.simplefilter('ignore', FutureWarning)
@@ -75,7 +74,7 @@ def _run(self):
         except Exception:
             p = None
         nv_dev = None
-        if _NV_OK:
+        if _NV_OK and pynvml:
             try:
                 nv_dev = pynvml.nvmlDeviceGetHandleByIndex(self.gpu_index)
             except Exception:
@@ -99,10 +98,14 @@ def _run(self):
             cpu = 0.0
             mem = 0.0
             gpu_util = vram = None
-            if nv_dev:
-                util = pynvml.nvmlDeviceGetUtilizationRates(nv_dev)
-                memi = pynvml.nvmlDeviceGetMemoryInfo(nv_dev)
-                gpu_util = float(util.gpu)
-                vram = float(memi.used) / (1024**2)
+            if nv_dev and pynvml and _NV_OK:
+                try:
+                    util = pynvml.nvmlDeviceGetUtilizationRates(nv_dev)
+                    memi = pynvml.nvmlDeviceGetMemoryInfo(nv_dev)
+                    gpu_util = float(getattr(util, 'gpu', 0.0))
+                    vram = float(getattr(memi, 'used', 0)) / (1024**2)
+                except Exception:
+                    gpu_util = None
+                    vram = None
             self.samples.append(Sample(time.time(), cpu, mem, gpu_util, vram))
             time.sleep(self.interval_s)
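
Outside the monitor thread, the same defensive NVML read can be sketched standalone (an illustration, not part of this commit; it assumes the pynvml module from the nvidia-ml-py package may or may not be importable):

```python
# Read GPU utilization and VRAM once, degrading to None when NVML is unavailable.
gpu_util = vram_mib = None
try:
    import pynvml

    pynvml.nvmlInit()
    dev = pynvml.nvmlDeviceGetHandleByIndex(0)
    util = pynvml.nvmlDeviceGetUtilizationRates(dev)
    memi = pynvml.nvmlDeviceGetMemoryInfo(dev)
    gpu_util = float(getattr(util, "gpu", 0.0))
    vram_mib = float(getattr(memi, "used", 0)) / (1024 ** 2)
    pynvml.nvmlShutdown()
except Exception:
    # No NVIDIA driver or NVML bindings: leave the GPU fields as None.
    pass

print(gpu_util, vram_mib)
```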

pyproject.toml

Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
+[project]
+name = "llm-bench"
+version = "0.0.0"
+description = "Lightweight LLM benchmark harness"
+requires-python = ">=3.10"
+
+[project.optional-dependencies]
+gpu = ["nvidia-ml-py"]
+
+[build-system]
+requires = ["setuptools>=61.0"]
+build-backend = "setuptools.build_meta"
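
With this metadata in place the NVML bindings stay optional: an install such as `pip install -e ".[gpu]"` would pull in nvidia-ml-py, while a plain install leaves the GPU probe in benches/monitor.py to fall back gracefully.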
