|
| 1 | +"""Offline retrieval-quality eval over the live search_v2 ranker (OCI #312). |
| 2 | +
|
| 3 | +Deterministic by construction: a fixed query set + a fixed index + no result cache. |
| 4 | +The indexer singleton and ranx are imported INSIDE functions so this module never |
| 5 | +touches the backend startup path (oci known-bug #3). |
| 6 | +
|
| 7 | +Tiers (Cohere reranking is pro-only -- you pay for Cohere): |
| 8 | + reranking=False -> free tier (deterministic BM25 + vector core ranker, no Cohere) |
| 9 | + reranking=True -> pro tier (Cohere rerank; requires COHERE_API_KEY) |
| 10 | +""" |
| 11 | +import json |
| 12 | +import os |
| 13 | +from datetime import datetime, timezone |
| 14 | +from pathlib import Path |
| 15 | +from typing import Dict, List, Sequence, Tuple |
| 16 | + |
# Directory containing this module; fixtures and outputs are resolved beside it.
EVAL_DIR = Path(__file__).parent
# Fixed query set (a JSON object with a "queries" list).
GROUND_TRUTH_PATH = EVAL_DIR.joinpath("ground_truth", "queries.json")
# Stored baseline document, loaded as-is by load_baseline().
BASELINE_PATH = EVAL_DIR.joinpath("baseline.json")
# One eval_<tier>_<timestamp>.json file is written here per run.
RESULTS_DIR = EVAL_DIR.joinpath("results")
# Repo used for queries that carry no repo_id; override with OCI_EVAL_REPO_ID.
DEFAULT_REPO_ID = os.environ.get("OCI_EVAL_REPO_ID", "78aa181e-2bbb-438b-97ee-9ffd494c4815")
# Number of hits requested from search_v2 for every scored query.
TOP_K = 10
# Cut-offs handed to compute_metrics.
K_VALUES: Sequence[int] = (5, 10)
| 24 | + |
| 25 | + |
def load_ground_truth(path: Path = GROUND_TRUTH_PATH) -> List[dict]:
    """Load the fixed evaluation query set.

    Args:
        path: JSON file whose top-level object has a ``"queries"`` entry.

    Returns:
        The value stored under ``"queries"``.

    Raises:
        OSError: If ``path`` cannot be read.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If the document has no ``"queries"`` key.
    """
    # Decode explicitly as UTF-8 so non-ASCII queries do not depend on the
    # platform's locale encoding (the default for read_text()).
    return json.loads(path.read_text(encoding="utf-8"))["queries"]
| 28 | + |
| 29 | + |
def load_baseline(path: Path = BASELINE_PATH) -> dict:
    """Load the stored baseline document.

    Args:
        path: JSON file to read.

    Returns:
        The parsed JSON document, unmodified.

    Raises:
        OSError: If ``path`` cannot be read.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Decode explicitly as UTF-8 rather than relying on the locale default
    # that read_text() would otherwise use.
    return json.loads(path.read_text(encoding="utf-8"))
| 32 | + |
| 33 | + |
def _dedupe_files_by_rank(results: List[dict]) -> List[Tuple[str, float]]:
    """Collapse function-level hits to file level, keeping the first rank per file.

    search_v2 returns function-level hits, so one file may appear several
    times; only its first (best-ranked) occurrence is kept.

    Args:
        results: Hits in rank order. Each hit is read through ``"file_path"``
            and ``"score"``.

    Returns:
        ``[(file_path, score), ...]`` in rank order, one entry per file. Hits
        with a missing or empty ``file_path`` are dropped; a missing or null
        ``score`` becomes ``0.0``.
    """
    first_score: Dict[str, float] = {}
    for hit in results:
        file_path = hit.get("file_path") or ""
        if not file_path or file_path in first_score:
            continue
        # A present-but-null score is treated like a missing one instead of
        # raising from float(None).
        first_score[file_path] = float(hit.get("score") or 0.0)
    # dicts preserve insertion order, so this is still rank order.
    return list(first_score.items())
| 43 | + |
| 44 | + |
async def _preflight(indexer, repo_id: str) -> bool:
    """Fail-closed corpus check: can the index return any hit for this repo?

    Protects the eval from measuring a stale or empty index (known-bug #4:
    Pinecone is eventual and may be unprovisioned). The probe runs with
    reranking disabled so it stays cheap and never needs Cohere.

    Args:
        indexer: Object exposing an ``index`` attribute and an awaitable
            ``search_v2``.
        repo_id: Repository to probe.

    Returns:
        True only when the probe search yields at least one hit. A missing
        index, an empty result, or any exception raised by the probe all
        return False so the caller skips the repo.
    """
    index_handle = getattr(indexer, "index", None)
    if index_handle is None:
        return False

    try:
        probe_hits = await indexer.search_v2(query="function", repo_id=repo_id, top_k=1, use_reranking=False)
        has_vectors = len(probe_hits) > 0
    except Exception:
        # Deliberately broad: any probe failure counts as "no corpus".
        has_vectors = False
    return has_vectors
| 59 | + |
| 60 | + |
async def run_eval(reranking: bool, repo_id_default: str = DEFAULT_REPO_ID) -> dict:
    """Run the full query set through search_v2 and compute metrics for one tier.

    Args:
        reranking: True evaluates the pro tier (``use_reranking=True``, needs
            COHERE_API_KEY); False evaluates the free core ranker.
        repo_id_default: Repo used for queries that carry no ``repo_id``.

    Returns:
        The summary dict (timestamp, tier, counts, metrics, per-query detail).
        The same dict is written to ``RESULTS_DIR/eval_<tier>_<timestamp>.json``.

    Raises:
        RuntimeError: If ``reranking`` is True and COHERE_API_KEY is not set.
    """
    from dependencies import indexer  # isolated import (known-bug #3)

    from .metrics import compute_metrics

    if reranking and not os.getenv("COHERE_API_KEY"):
        raise RuntimeError(
            "Pro-tier (reranked) eval requested but COHERE_API_KEY is not set. "
            "Cohere reranking is pro-only; set the key or run free-tier (--free-only)."
        )

    queries = load_ground_truth()
    qrels: Dict[str, Dict[str, int]] = {}
    run: Dict[str, Dict[str, float]] = {}
    per_query: List[dict] = []
    empties: List[str] = []
    skipped_repos = set()
    # One preflight probe per repo, not per query.
    preflight_cache: Dict[str, bool] = {}

    for q in queries:
        qid = str(q["query_id"])
        repo_id = q.get("repo_id") or repo_id_default

        if repo_id not in preflight_cache:
            preflight_cache[repo_id] = await _preflight(indexer, repo_id)
        if not preflight_cache[repo_id]:
            skipped_repos.add(repo_id)
            print(
                f"[SKIP-LOUD] repo {repo_id} has no vectors in the index; skipping "
                f"query {qid} (measuring it would corrupt recall). Index the repo first."
            )
            continue

        expected = list(q["expected_files"])
        qrels[qid] = {fp: 1 for fp in expected}

        results = await indexer.search_v2(
            query=q["query"], repo_id=repo_id, top_k=TOP_K, use_reranking=reranking
        )
        # FM-3: search_v2 swallows exceptions and returns []. We cannot tell a true
        # no-hit from a swallowed error here, so flag it loudly instead of hiding it.
        # Tracked per iteration so the flag describes this result only.
        is_empty = not results
        if is_empty:
            empties.append(qid)

        ranked = _dedupe_files_by_rank(results)
        ranked_paths = [fp for fp, _ in ranked]
        # Placeholder keeps an entry for this query in the run even with no
        # hits; presumably the metrics backend needs that -- TODO confirm.
        run[qid] = dict(ranked) or {"__no_results__": 0.0}

        # 1-based rank of each returned file; paths are already unique.
        rank_of = {fp: pos for pos, fp in enumerate(ranked_paths, start=1)}
        expected_ranks = {fp: rank_of.get(fp) for fp in expected}
        per_query.append(
            {
                "query_id": qid,
                "query": q["query"],
                "repo_id": repo_id,
                "expected_files": expected,
                "returned_files": ranked_paths,
                "expected_ranks": expected_ranks,
                "empty_result": is_empty,
            }
        )

    # Every query may have been skipped; then there is nothing to score.
    metrics = compute_metrics(qrels, run, K_VALUES) if qrels else {}

    tier = "pro_reranked" if reranking else "free_core"
    ts = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
    out = {
        "timestamp": ts,
        "tier": tier,
        "reranking": reranking,
        "n_queries_scored": len(qrels),
        "n_empty_results": len(empties),
        "empty_query_ids": empties,
        "skipped_repos": sorted(skipped_repos),
        "metrics": metrics,
        "per_query": per_query,
    }
    RESULTS_DIR.mkdir(parents=True, exist_ok=True)
    (RESULTS_DIR / f"eval_{tier}_{ts}.json").write_text(json.dumps(out, indent=2))
    return out
0 commit comments