Merge pull request #321 from DevanshuNEU/fix/eval-path-normalization-calibration

DevanshuNEU · web-flow · commit 42d6b1c8510c · 2026-06-12T13:17:48.000-04:00
fix: normalize indexed repo paths in eval runner, calibrate baseline
diff --git a/backend/evals/baseline.json b/backend/evals/baseline.json
@@ -1,13 +1,13 @@
 {
-  "calibrated": false,
+  "calibrated": true,
   "tolerance": 0.05,
   "free_core": {
-    "recall@10": null,
-    "mrr": null
+    "recall@10": 0.8,
+    "mrr": 0.8
   },
   "pro_reranked": {
-    "recall@10": null,
-    "mrr": null
+    "recall@10": 0.85,
+    "mrr": 0.6583
   },
-  "note": "Set calibrated=true and fill the numbers after the first real `python -m evals` run against a populated index. Until calibrated, the pytest gate skips rather than asserting against a fake baseline."
+  "note": "Calibrated 2026-06-12 against the OCI repo index (repo_id 78aa181e-..., PINECONE_INDEX_NAME=codeintel) over the 10-query ground-truth set. Free-tier is the CI regression baseline; the pytest gate asserts free recall@10 >= 0.8 - tolerance. Reranking raises recall@10 to 0.85 but lowers MRR to 0.658 on this set (pulls one more expected file into top-10 while demoting rank-1 hits) -- pro tier is not strictly better here. Known ranker misses: q06 (durable repo-state) and q10 (path-filtering). Re-calibrate after any search_v2 ranker/embedding change."
 }
diff --git a/backend/evals/runner.py b/backend/evals/runner.py
@@ -35,12 +35,21 @@ def load_baseline(path: Path = BASELINE_PATH) -> dict:
     return json.loads(path.read_text())
 
 
-def _dedupe_files_by_rank(results: List[dict]) -> List[Tuple[str, float]]:
+def _to_repo_relative(file_path: str, repo_id: str) -> str:
+    """The index stores paths as `repos/<repo_id>/<repo-root-relative>`; ground truth
+    is repo-root-relative (and stays that way so it's portable across repo_ids). Strip
+    the exact storage prefix so the two can be compared directly. The match is exact on
+    repo_id, so we never over-trim a path that merely happens to start with `repos/`."""
+    prefix = f"repos/{repo_id}/"
+    return file_path[len(prefix):] if file_path.startswith(prefix) else file_path
+
+
+def _dedupe_files_by_rank(results: List[dict], repo_id: str) -> List[Tuple[str, float]]:
     """search_v2 returns function-level hits; collapse to file-level, keeping the
     best (first) rank per file. Returns [(file_path, score)] in rank order."""
     seen: Dict[str, float] = {}
     for r in results:
-        fp = r.get("file_path") or ""
+        fp = _to_repo_relative(r.get("file_path") or "", repo_id)
         if fp and fp not in seen:
             seen[fp] = float(r.get("score", 0.0))
     return list(seen.items())
@@ -113,7 +122,7 @@ async def run_eval(reranking: bool, repo_id_default: str = DEFAULT_REPO_ID) -> D
             # no-hit from a swallowed error here, so flag it loudly instead of hiding it.
             empties.append(qid)
 
-        ranked = _dedupe_files_by_rank(results)
+        ranked = _dedupe_files_by_rank(results, repo_id)
         ranked_paths = [fp for fp, _ in ranked]
         run[qid] = {fp: score for fp, score in ranked} or {"__no_results__": 0.0}