Skip to content

Commit 42d6b1c

Browse files
authored
Merge pull request #321 from DevanshuNEU/fix/eval-path-normalization-calibration
fix: normalize indexed repo paths in eval runner, calibrate baseline
2 parents 1c5a935 + 640907d commit 42d6b1c

2 files changed

Lines changed: 18 additions & 9 deletions

File tree

backend/evals/baseline.json

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,13 @@
11
{
2-
"calibrated": false,
2+
"calibrated": true,
33
"tolerance": 0.05,
44
"free_core": {
5-
"recall@10": null,
6-
"mrr": null
5+
"recall@10": 0.8,
6+
"mrr": 0.8
77
},
88
"pro_reranked": {
9-
"recall@10": null,
10-
"mrr": null
9+
"recall@10": 0.85,
10+
"mrr": 0.6583
1111
},
12-
"note": "Set calibrated=true and fill the numbers after the first real `python -m evals` run against a populated index. Until calibrated, the pytest gate skips rather than asserting against a fake baseline."
12+
"note": "Calibrated 2026-06-12 against the OCI repo index (repo_id 78aa181e-..., PINECONE_INDEX_NAME=codeintel) over the 10-query ground-truth set. Free-tier is the CI regression baseline; the pytest gate asserts free recall@10 >= 0.8 - tolerance. Reranking raises recall@10 to 0.85 but lowers MRR to 0.658 on this set (pulls one more expected file into top-10 while demoting rank-1 hits) -- pro tier is not strictly better here. Known ranker misses: q06 (durable repo-state) and q10 (path-filtering). Re-calibrate after any search_v2 ranker/embedding change."
1313
}

backend/evals/runner.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -35,12 +35,21 @@ def load_baseline(path: Path = BASELINE_PATH) -> dict:
3535
return json.loads(path.read_text())
3636

3737

38-
def _dedupe_files_by_rank(results: List[dict]) -> List[Tuple[str, float]]:
38+
def _to_repo_relative(file_path: str, repo_id: str) -> str:
39+
"""The index stores paths as `repos/<repo_id>/<repo-root-relative>`; ground truth
40+
is repo-root-relative (and stays that way so it's portable across repo_ids). Strip
41+
the exact storage prefix so the two compare. Exact match on repo_id, so we never
42+
over-trim a path that happens to start with `repos/`."""
43+
prefix = f"repos/{repo_id}/"
44+
return file_path[len(prefix):] if file_path.startswith(prefix) else file_path
45+
46+
47+
def _dedupe_files_by_rank(results: List[dict], repo_id: str) -> List[Tuple[str, float]]:
    """Collapse function-level search hits to one entry per file.

    search_v2 returns function-level hits; each hit's `file_path` is converted
    to a repo-relative path and only the first (best-ranked) hit per file is
    kept. Hits whose `file_path` is missing or empty are dropped.

    Args:
        results: Hits in rank order; each is read via its `file_path` and
            `score` keys.
        repo_id: Repo id passed to `_to_repo_relative` to strip the index
            storage prefix from each path.

    Returns:
        [(file_path, score)] in rank order, one tuple per distinct file.
    """
    seen: Dict[str, float] = {}
    for r in results:
        fp = _to_repo_relative(r.get("file_path") or "", repo_id)
        if fp and fp not in seen:
            # `or 0.0` also covers an explicit `"score": None`, which would make
            # float() raise; this mirrors the `or ""` guard used for file_path.
            seen[fp] = float(r.get("score") or 0.0)
    return list(seen.items())
@@ -113,7 +122,7 @@ async def run_eval(reranking: bool, repo_id_default: str = DEFAULT_REPO_ID) -> D
113122
# no-hit from a swallowed error here, so flag it loudly instead of hiding it.
114123
empties.append(qid)
115124

116-
ranked = _dedupe_files_by_rank(results)
125+
ranked = _dedupe_files_by_rank(results, repo_id)
117126
ranked_paths = [fp for fp, _ in ranked]
118127
run[qid] = {fp: score for fp, score in ranked} or {"__no_results__": 0.0}
119128

0 commit comments

Comments
 (0)