Skip to content

Commit be2ed91

Browse files
committed
feat: retrieval-quality eval harness core (#312)
Offline, deterministic measurement of the live search_v2 ranker: recall@k, MRR, precision@k over a labeled query set. runner calls indexer.search_v2 in-process (cache off) so the number is repeatable; metrics are computed by ranx (bought, not hand-rolled -- recall/MRR math is a subtle-bug factory and a wrong metric is worse than no eval). Chose in-process over live-HTTP: the route cache makes HTTP nondeterministic, and recall/MRR are properties of the ranker, not the transport. Cohere reranking is pro-only, so the CLI records both tiers (free=no-rerank baseline, pro=Cohere) and prints the delta = what reranking is worth. ranx and the indexer singleton are imported inside functions, never at module top, so this never reaches the backend startup path (known-bug #3).
1 parent ca451f0 commit be2ed91

5 files changed

Lines changed: 255 additions & 0 deletions

File tree

backend/evals/__init__.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
"""Retrieval-quality eval harness (OCI #312).
2+
3+
Offline, deterministic measurement of the live search_v2 ranker: recall@k, MRR,
4+
precision@k over a human-labeled query set. Run via `python -m evals` from backend/.
5+
6+
Import isolation: this package and its deps (ranx) are imported only when an eval
7+
runs, never on the backend startup path. See oci known-bug #3.
8+
"""

backend/evals/__main__.py

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
"""CLI entry for the retrieval-quality eval (OCI #312).
2+
3+
Usage (from backend/):
4+
python -m evals # record BOTH numbers: free tier + pro tier
5+
python -m evals --free-only # free tier only (no Cohere, deterministic)
6+
python -m evals --pro-only # pro tier only (Cohere rerank; needs COHERE_API_KEY)
7+
"""
8+
import argparse
9+
import asyncio
10+
11+
from .runner import run_eval
12+
13+
14+
def _print_table(label: str, out: dict) -> None:
    """Print a human-readable summary of one eval result to stdout.

    Args:
        label: heading text shown in the banner line.
        out: result dict; the keys read here are "tier", "reranking",
            "n_queries_scored", "n_empty_results", "skipped_repos",
            "metrics" and (only when metrics exist) "empty_query_ids".
    """
    banner = f"\n=== {label} (tier={out['tier']}, reranking={out['reranking']}) ==="
    print(banner)

    counts = "".join(
        (
            f"queries scored: {out['n_queries_scored']} ",
            f"empty/ambiguous: {out['n_empty_results']} ",
            f"skipped repos: {len(out['skipped_repos'])}",
        )
    )
    print(counts)

    metrics = out["metrics"]
    if metrics:
        # One row per metric, name left-aligned, value to four decimals.
        for metric_name in metrics:
            print(f" {metric_name:<14} {metrics[metric_name]:.4f}")

        empty_ids = out["empty_query_ids"]
        if empty_ids:
            warning = (
                f" NOTE: empty results for queries {empty_ids} "
                f"(counted as misses; investigate index/errors before trusting these numbers)."
            )
            print(warning)
    else:
        # Nothing was scored, so there is no metric table to show.
        print(" no metrics: no queries scored (index not populated?). See SKIP-LOUD lines above.")
31+
32+
33+
def main(argv=None) -> None:
    """Parse the CLI flags and run the eval for the requested tier(s).

    With no flag, both tiers are run: the free tier first, then the pro tier,
    followed by the recall@10 delta between them. If the pro-tier run raises
    RuntimeError, the pro tier is reported as skipped and the free-tier output
    stands on its own.

    Args:
        argv: argument list to parse instead of sys.argv[1:]; None (the
            default) keeps the normal command-line behavior.
    """
    ap = argparse.ArgumentParser(prog="python -m evals", description="OCI retrieval-quality eval (#312)")
    g = ap.add_mutually_exclusive_group()
    g.add_argument("--free-only", action="store_true", help="free-tier only (no reranking)")
    g.add_argument("--pro-only", action="store_true", help="pro-tier only (Cohere reranking)")
    args = ap.parse_args(argv)

    if args.free_only:
        _print_table("FREE TIER", asyncio.run(run_eval(reranking=False)))
        return
    if args.pro_only:
        _print_table("PRO TIER", asyncio.run(run_eval(reranking=True)))
        return

    # default: record both numbers so we can see exactly what reranking is worth
    free = asyncio.run(run_eval(reranking=False))
    _print_table("FREE TIER (core ranker)", free)

    # Only the eval run itself is guarded: a RuntimeError raised while printing
    # the table or computing the delta must not be reported as a skipped tier.
    try:
        pro = asyncio.run(run_eval(reranking=True))
    except RuntimeError as e:
        print(f"\n[PRO TIER SKIPPED] {e}")
        return

    _print_table("PRO TIER (Cohere rerank)", pro)
    f10, p10 = free["metrics"].get("recall@10"), pro["metrics"].get("recall@10")
    if f10 is not None and p10 is not None:
        print(f"\nReranking delta recall@10: {p10 - f10:+.4f} (what the pro tier buys)")
58+
59+
60+
# Entry guard: run the CLI only when this module is executed as a script
# (e.g. `python -m evals`), never as a side effect of importing it.
if "__main__" == __name__:
    main()

backend/evals/metrics.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
"""Retrieval metrics, computed by ranx.
2+
3+
ranx is imported lazily inside the function: it is a heavy optional dep and must
4+
never be on the backend startup import path (oci known-bug #3). We buy the metric
5+
math instead of hand-rolling it -- recall@k / MRR are a classic subtle-bug factory,
6+
and a buggy metric makes the eval confidently wrong (worse than no eval).
7+
"""
8+
from typing import Dict, Sequence
9+
10+
11+
def compute_metrics(
    qrels: Dict[str, Dict[str, int]],
    run: Dict[str, Dict[str, float]],
    k_values: Sequence[int] = (5, 10),
) -> Dict[str, float]:
    """Compute recall@k, precision@k for each k, plus MRR (over the returned set).

    Args:
        qrels: {query_id: {doc_id: relevance>=1}} -- the human-labeled answer key.
        run: {query_id: {doc_id: score}} -- what search returned.
        k_values: cutoffs to report; repeated values are reported once.

    Returns:
        {metric_name: value} with keys "recall@k" and "precision@k" per cutoff
        plus "mrr". An empty dict when qrels is empty (nothing to score).

    Raises:
        ModuleNotFoundError: ranx is not installed.
    """
    # Nothing labeled means nothing to score. Checked before the import so an
    # empty eval does not need ranx at all.
    if not qrels:
        return {}

    try:
        from ranx import Qrels, Run, evaluate
    except ModuleNotFoundError as e:
        # Only claim "ranx is required" when ranx itself is what is missing; a
        # missing transitive dependency keeps its own, more accurate, message.
        if e.name is not None and e.name.split(".")[0] != "ranx":
            raise
        raise ModuleNotFoundError(
            "ranx is required for eval metrics. Install dev deps: "
            "pip install -r backend/requirements-dev.txt"
        ) from e

    metric_names = []
    # dict.fromkeys drops repeated cutoffs while keeping the caller's order.
    for k in dict.fromkeys(k_values):
        metric_names.append(f"recall@{k}")
        metric_names.append(f"precision@{k}")
    metric_names.append("mrr")  # results are already capped at top_k, so this is MRR@top_k

    scores = evaluate(Qrels(qrels), Run(run), metric_names)
    # ranx returns a bare scalar when a single metric is requested; a dict otherwise.
    if not isinstance(scores, dict):
        scores = {metric_names[0]: scores}
    return {m: float(scores[m]) for m in metric_names}

backend/evals/runner.py

Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,142 @@
1+
"""Offline retrieval-quality eval over the live search_v2 ranker (OCI #312).
2+
3+
Deterministic by construction: a fixed query set + a fixed index + no result cache.
4+
The indexer singleton and ranx are imported INSIDE functions so this module never
5+
touches the backend startup path (oci known-bug #3).
6+
7+
Tiers (Cohere reranking is pro-only -- you pay for Cohere):
8+
reranking=False -> free tier (deterministic BM25 + vector core ranker, no Cohere)
9+
reranking=True -> pro tier (Cohere rerank; requires COHERE_API_KEY)
10+
"""
11+
import json
12+
import os
13+
from datetime import datetime, timezone
14+
from pathlib import Path
15+
from typing import Dict, List, Sequence, Tuple
16+
17+
# Filesystem layout, all resolved relative to the directory holding this module.
EVAL_DIR = Path(__file__).parent
GROUND_TRUTH_PATH = EVAL_DIR / "ground_truth" / "queries.json"
BASELINE_PATH = EVAL_DIR / "baseline.json"
RESULTS_DIR = EVAL_DIR / "results"
# Repo used for queries that carry no "repo_id". An unset OR empty
# OCI_EVAL_REPO_ID falls back to the built-in id, so the default is never "".
DEFAULT_REPO_ID = os.getenv("OCI_EVAL_REPO_ID") or "78aa181e-2bbb-438b-97ee-9ffd494c4815"
# Number of hits requested from search_v2 per query.
TOP_K = 10
# Cutoffs handed to compute_metrics for recall@k / precision@k.
K_VALUES: Sequence[int] = (5, 10)
24+
25+
26+
def load_ground_truth(path: Path = GROUND_TRUTH_PATH) -> List[dict]:
    """Return the labeled queries stored under the top-level "queries" key of *path*.

    The file is decoded as UTF-8 explicitly, so the result does not depend on
    the platform's locale encoding.

    Raises:
        OSError: the file cannot be read.
        json.JSONDecodeError: the file is not valid JSON.
        KeyError: the JSON document has no "queries" key.
    """
    return json.loads(path.read_text(encoding="utf-8"))["queries"]
28+
29+
30+
def load_baseline(path: Path = BASELINE_PATH) -> dict:
    """Return the parsed JSON document stored at *path*.

    The file is decoded as UTF-8 explicitly, so the result does not depend on
    the platform's locale encoding.

    Raises:
        OSError: the file cannot be read.
        json.JSONDecodeError: the file is not valid JSON.
    """
    return json.loads(path.read_text(encoding="utf-8"))
32+
33+
34+
def _dedupe_files_by_rank(results: List[dict]) -> List[Tuple[str, float]]:
35+
"""search_v2 returns function-level hits; collapse to file-level, keeping the
36+
best (first) rank per file. Returns [(file_path, score)] in rank order."""
37+
seen: Dict[str, float] = {}
38+
for r in results:
39+
fp = r.get("file_path") or ""
40+
if fp and fp not in seen:
41+
seen[fp] = float(r.get("score", 0.0))
42+
return list(seen.items())
43+
44+
45+
async def _preflight(indexer, repo_id: str) -> bool:
46+
"""Fail-closed corpus check: does the index actually have vectors for this repo?
47+
48+
Guards against measuring a stale/empty index (known-bug #4: Pinecone is eventual
49+
and may be unprovisioned). Reranking off so the probe is cheap and never needs
50+
Cohere. A missing index or a swallowed search error both surface as 0 hits -> skip.
51+
"""
52+
if getattr(indexer, "index", None) is None:
53+
return False
54+
try:
55+
probe = await indexer.search_v2(query="function", repo_id=repo_id, top_k=1, use_reranking=False)
56+
return len(probe) > 0
57+
except Exception:
58+
return False
59+
60+
61+
async def run_eval(reranking: bool, repo_id_default: str = DEFAULT_REPO_ID) -> dict:
    """Run the full query set through search_v2 and compute metrics for one tier.

    Args:
        reranking: passed to search_v2 as use_reranking. True is the pro tier
            (Cohere rerank), False the free core ranker.
        repo_id_default: repo searched for queries that carry no "repo_id".

    Returns:
        The result dict, which is also written to
        RESULTS_DIR / "eval_<tier>_<timestamp>.json". Keys: "timestamp", "tier",
        "reranking", "n_queries_scored", "n_empty_results", "empty_query_ids",
        "skipped_repos", "metrics" and "per_query".

    Raises:
        RuntimeError: reranking was requested but COHERE_API_KEY is not set.
    """
    from dependencies import indexer  # isolated import (known-bug #3)

    from .metrics import compute_metrics

    if reranking and not os.getenv("COHERE_API_KEY"):
        raise RuntimeError(
            "Pro-tier (reranked) eval requested but COHERE_API_KEY is not set. "
            "Cohere reranking is pro-only; set the key or run free-tier (--free-only)."
        )

    queries = load_ground_truth()
    qrels: Dict[str, Dict[str, int]] = {}
    run: Dict[str, Dict[str, float]] = {}
    per_query: List[dict] = []
    empties: List[str] = []
    skipped_repos = set()
    preflight_cache: Dict[str, bool] = {}

    for q in queries:
        qid = str(q["query_id"])
        repo_id = q.get("repo_id") or repo_id_default

        # Probe each repo once; every query of a repo that failed is skipped.
        if repo_id not in preflight_cache:
            preflight_cache[repo_id] = await _preflight(indexer, repo_id)
        if not preflight_cache[repo_id]:
            skipped_repos.add(repo_id)
            print(
                f"[SKIP-LOUD] repo {repo_id} has no vectors in the index; skipping "
                f"query {qid} (measuring it would corrupt recall). Index the repo first."
            )
            continue

        expected = list(q["expected_files"])
        qrels[qid] = {fp: 1 for fp in expected}

        results = await indexer.search_v2(
            query=q["query"], repo_id=repo_id, top_k=TOP_K, use_reranking=reranking
        )
        is_empty = not results
        if is_empty:
            # FM-3: search_v2 swallows exceptions and returns []. We cannot tell a true
            # no-hit from a swallowed error here, so flag it loudly instead of hiding it.
            empties.append(qid)

        # `or []` keeps a None result from crashing the dedupe step.
        ranked = _dedupe_files_by_rank(results or [])
        ranked_paths = [fp for fp, _ in ranked]

        # ranx orders a run by score, not by insertion order. Encode the returned
        # order as strictly decreasing scores so the metrics are computed on the
        # same ranking that "expected_ranks" reports, even when raw scores are
        # tied or not monotonic with rank. The placeholder keeps a result-less
        # query present in the run so it is still scored (as a miss).
        n_ranked = len(ranked_paths)
        run[qid] = {fp: float(n_ranked - i) for i, fp in enumerate(ranked_paths)} or {
            "__no_results__": 0.0
        }

        # 1-based rank per returned file; None for expected files that were not returned.
        rank_of = {fp: pos for pos, fp in enumerate(ranked_paths, start=1)}
        expected_ranks = {fp: rank_of.get(fp) for fp in expected}
        per_query.append(
            {
                "query_id": qid,
                "query": q["query"],
                "repo_id": repo_id,
                "expected_files": expected,
                "returned_files": ranked_paths,
                "expected_ranks": expected_ranks,
                "empty_result": is_empty,
            }
        )

    metrics = compute_metrics(qrels, run, K_VALUES) if qrels else {}

    tier = "pro_reranked" if reranking else "free_core"
    ts = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
    out = {
        "timestamp": ts,
        "tier": tier,
        "reranking": reranking,
        "n_queries_scored": len(qrels),
        "n_empty_results": len(empties),
        "empty_query_ids": empties,
        "skipped_repos": sorted(skipped_repos),
        "metrics": metrics,
        "per_query": per_query,
    }
    RESULTS_DIR.mkdir(exist_ok=True)
    # Explicit UTF-8 so the results file does not depend on the locale encoding.
    (RESULTS_DIR / f"eval_{tier}_{ts}.json").write_text(json.dumps(out, indent=2), encoding="utf-8")
    return out

backend/requirements-dev.txt

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
# Dev / eval-only dependencies. NOT installed in production.
2+
# Kept separate from requirements.txt so heavy eval deps never reach the backend
3+
# startup path (oci known-bug #3: optional imports must not crash startup).
4+
# Install with: pip install -r backend/requirements-dev.txt
5+
ranx>=0.3.20

0 commit comments

Comments
 (0)