Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions backend/evals/baseline.json
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
{
"calibrated": false,
"calibrated": true,
"tolerance": 0.05,
"free_core": {
"recall@10": null,
"mrr": null
"recall@10": 0.8,
"mrr": 0.8
},
"pro_reranked": {
"recall@10": null,
"mrr": null
"recall@10": 0.85,
"mrr": 0.6583
},
"note": "Set calibrated=true and fill the numbers after the first real `python -m evals` run against a populated index. Until calibrated, the pytest gate skips rather than asserting against a fake baseline."
"note": "Calibrated 2026-06-12 against the OCI repo index (repo_id 78aa181e-..., PINECONE_INDEX_NAME=codeintel) over the 10-query ground-truth set. Free-tier is the CI regression baseline; the pytest gate asserts free recall@10 >= 0.8 - tolerance. Reranking raises recall@10 to 0.85 but lowers MRR to 0.658 on this set (pulls one more expected file into top-10 while demoting rank-1 hits) -- pro tier is not strictly better here. Known ranker misses: q06 (durable repo-state) and q10 (path-filtering). Re-calibrate after any search_v2 ranker/embedding change."
}
15 changes: 12 additions & 3 deletions backend/evals/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,12 +35,21 @@ def load_baseline(path: Path = BASELINE_PATH) -> dict:
return json.loads(path.read_text())


def _dedupe_files_by_rank(results: List[dict]) -> List[Tuple[str, float]]:
def _to_repo_relative(file_path: str, repo_id: str) -> str:
    """Convert an index storage path into a repo-root-relative path.

    The index stores paths as `repos/<repo_id>/<repo-root-relative>`, while the
    ground truth is repo-root-relative (which keeps it portable across repo_ids).
    Only the exact `repos/<repo_id>/` prefix is stripped, so a path that merely
    happens to start with `repos/` is never over-trimmed; any path without that
    exact prefix is returned unchanged.
    """
    storage_prefix = "repos/" + repo_id + "/"
    if not file_path.startswith(storage_prefix):
        return file_path
    return file_path[len(storage_prefix):]
Comment thread
DevanshuNEU marked this conversation as resolved.


def _dedupe_files_by_rank(results: List[dict], repo_id: str) -> List[Tuple[str, float]]:
    """search_v2 returns function-level hits; collapse to file-level, keeping the
    best (first) rank per file. Returns [(file_path, score)] in rank order.

    Each hit's `file_path` is passed through `_to_repo_relative` with `repo_id`
    before de-duplication. Hits whose `file_path` is missing or empty are
    skipped. A missing or None `score` is recorded as 0.0.
    """
    seen: Dict[str, float] = {}
    for r in results:
        fp = _to_repo_relative(r.get("file_path") or "", repo_id)
        # Only the first occurrence of a file is kept: results arrive in rank
        # order, so the first hit is the file's best rank.
        if fp and fp not in seen:
            # `or 0.0` mirrors the `or ""` handling of file_path so an explicit
            # None score does not raise TypeError in float().
            seen[fp] = float(r.get("score") or 0.0)
    return list(seen.items())
Expand Down Expand Up @@ -113,7 +122,7 @@ async def run_eval(reranking: bool, repo_id_default: str = DEFAULT_REPO_ID) -> D
# no-hit from a swallowed error here, so flag it loudly instead of hiding it.
empties.append(qid)

ranked = _dedupe_files_by_rank(results)
ranked = _dedupe_files_by_rank(results, repo_id)
ranked_paths = [fp for fp, _ in ranked]
run[qid] = {fp: score for fp, score in ranked} or {"__no_results__": 0.0}

Expand Down
Loading