feat: retrieval-quality eval harness core (#312)

DevanshuNEU · DevanshuNEU · commit be2ed914e160 · 2026-06-10T23:47:25.000-04:00
Offline, deterministic measurement of the live search_v2 ranker: recall@k, MRR, precision@k over a labeled query set. runner calls indexer.search_v2 in-process (cache off) so the number is repeatable; metrics are computed by ranx (bought, not hand-rolled -- recall/MRR math is a subtle-bug factory and a wrong metric is worse than no eval). Chose in-process over live-HTTP: the route cache makes HTTP nondeterministic, and recall/MRR are properties of the ranker, not the transport. Cohere reranking is pro-only, so the CLI records both tiers (free=no-rerank baseline, pro=Cohere) and prints the delta = what reranking is worth. ranx and the indexer singleton are imported inside functions, never at module top, so this never reaches the backend startup path (known-bug #3).
diff --git a/backend/evals/__init__.py b/backend/evals/__init__.py
@@ -0,0 +1,8 @@
+"""Retrieval-quality eval harness (OCI #312).
+
+Offline, deterministic measurement of the live search_v2 ranker: recall@k, MRR,
+precision@k over a human-labeled query set. Run via `python -m evals` from backend/.
+
+Import isolation: this package and its deps (ranx) are imported only when an eval
+runs, never on the backend startup path. See oci known-bug #3.
+"""
diff --git a/backend/evals/__main__.py b/backend/evals/__main__.py
@@ -0,0 +1,61 @@
+"""CLI entry for the retrieval-quality eval (OCI #312).
+
+Usage (from backend/):
+    python -m evals               # record BOTH numbers: free tier + pro tier
+    python -m evals --free-only   # free tier only (no Cohere, deterministic)
+    python -m evals --pro-only    # pro tier only (Cohere rerank; needs COHERE_API_KEY)
+"""
+import argparse
+import asyncio
+
+from .runner import run_eval
+
+
+def _print_table(label: str, out: dict) -> None:
+    """Print a human-readable summary of one eval result dict to stdout.
+
+    Emits a header line (label, tier, reranking flag), a counts line, then either
+    one line per metric or a hint that nothing was scored. When metrics exist and
+    some queries returned nothing, a trailing NOTE lists those query ids.
+    """
+    header = f"\n=== {label}  (tier={out['tier']}, reranking={out['reranking']}) ==="
+    print(header)
+
+    scored = out["n_queries_scored"]
+    empty_count = out["n_empty_results"]
+    skipped_count = len(out["skipped_repos"])
+    print(
+        f"queries scored: {scored}   empty/ambiguous: {empty_count}   skipped repos: {skipped_count}"
+    )
+
+    metrics = out["metrics"]
+    if metrics:
+        # Metric names are left-aligned in a 14-char column, values shown to 4 decimals.
+        for name in metrics:
+            print(f"  {name:<14} {metrics[name]:.4f}")
+        empty_ids = out["empty_query_ids"]
+        if empty_ids:
+            print(
+                f"  NOTE: empty results for queries {empty_ids} "
+                f"(counted as misses; investigate index/errors before trusting these numbers)."
+            )
+    else:
+        print("  no metrics: no queries scored (index not populated?). See SKIP-LOUD lines above.")
+
+
+def main() -> None:
+    """Parse CLI flags and run the eval for the requested tier(s).
+
+    --free-only / --pro-only (mutually exclusive) run and print a single tier.
+    With no flag, the free tier is run and printed first, then the pro tier; if
+    the pro run raises RuntimeError (run_eval raises it when COHERE_API_KEY is
+    missing) the pro tier is reported as skipped instead of aborting.
+    """
+    ap = argparse.ArgumentParser(prog="python -m evals", description="OCI retrieval-quality eval (#312)")
+    g = ap.add_mutually_exclusive_group()
+    g.add_argument("--free-only", action="store_true", help="free-tier only (no reranking)")
+    g.add_argument("--pro-only", action="store_true", help="pro-tier only (Cohere reranking)")
+    args = ap.parse_args()
+
+    if args.free_only:
+        _print_table("FREE TIER", asyncio.run(run_eval(reranking=False)))
+        return
+    if args.pro_only:
+        # An explicit pro-only request is allowed to fail loudly (no skip handling).
+        _print_table("PRO TIER", asyncio.run(run_eval(reranking=True)))
+        return
+
+    # default: record both numbers so we can see exactly what reranking is worth
+    free = asyncio.run(run_eval(reranking=False))
+    _print_table("FREE TIER (core ranker)", free)
+
+    # Keep the try body to the one call that is expected to raise, so an error
+    # while printing or computing the delta is never mislabelled as a skip.
+    try:
+        pro = asyncio.run(run_eval(reranking=True))
+    except RuntimeError as e:
+        print(f"\n[PRO TIER SKIPPED] {e}")
+        return
+
+    _print_table("PRO TIER (Cohere rerank)", pro)
+    f10, p10 = free["metrics"].get("recall@10"), pro["metrics"].get("recall@10")
+    # Either side may lack recall@10 (e.g. no queries scored); only print a real delta.
+    if f10 is not None and p10 is not None:
+        print(f"\nReranking delta recall@10: {p10 - f10:+.4f}  (what the pro tier buys)")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/backend/evals/metrics.py b/backend/evals/metrics.py
@@ -0,0 +1,39 @@
+"""Retrieval metrics, computed by ranx.
+
+ranx is imported lazily inside the function: it is a heavy optional dep and must
+never be on the backend startup import path (oci known-bug #3). We buy the metric
+math instead of hand-rolling it -- recall@k / MRR are a classic subtle-bug factory,
+and a buggy metric makes the eval confidently wrong (worse than no eval).
+"""
+from typing import Dict, Sequence
+
+
+def compute_metrics(
+    qrels: Dict[str, Dict[str, int]],
+    run: Dict[str, Dict[str, float]],
+    k_values: Sequence[int] = (5, 10),
+) -> Dict[str, float]:
+    """Compute recall@k, precision@k for each k, plus MRR (over the returned set).
+
+    Args:
+        qrels: {query_id: {doc_id: relevance>=1}} -- the human-labeled answer key.
+        run:   {query_id: {doc_id: score}}        -- what search returned.
+        k_values: cutoffs to evaluate; repeated values are evaluated once.
+
+    Returns:
+        {metric_name: value} with keys "recall@k" and "precision@k" for every k,
+        plus "mrr".
+
+    Raises:
+        ModuleNotFoundError: ranx is not installed (message names the dev
+            requirements file to install).
+    """
+    try:
+        # Lazy import: ranx must stay off the backend startup import path.
+        from ranx import Qrels, Run, evaluate
+    except ModuleNotFoundError as e:
+        raise ModuleNotFoundError(
+            "ranx is required for eval metrics. Install dev deps: "
+            "pip install -r backend/requirements-dev.txt"
+        ) from e
+
+    # dict.fromkeys drops repeated cutoffs while keeping the caller's order, so
+    # the same metric is never requested twice.
+    cutoffs = list(dict.fromkeys(k_values))
+    metric_names = [f"{kind}@{k}" for k in cutoffs for kind in ("recall", "precision")]
+    metric_names.append("mrr")  # results are already capped at top_k, so this is MRR@top_k
+
+    scores = evaluate(Qrels(qrels), Run(run), metric_names)
+    # ranx returns a bare scalar when a single metric is requested; a dict otherwise.
+    # Test for "not a dict" rather than for float so scalar types that do not
+    # subclass float are wrapped as well.
+    if not isinstance(scores, dict):
+        scores = {metric_names[0]: scores}
+    return {m: float(scores[m]) for m in metric_names}
diff --git a/backend/evals/runner.py b/backend/evals/runner.py
@@ -0,0 +1,142 @@
+"""Offline retrieval-quality eval over the live search_v2 ranker (OCI #312).
+
+Deterministic by construction: a fixed query set + a fixed index + no result cache.
+The indexer singleton and ranx are imported INSIDE functions so this module never
+touches the backend startup path (oci known-bug #3).
+
+Tiers (Cohere reranking is pro-only -- you pay for Cohere):
+  reranking=False -> free tier  (deterministic BM25 + vector core ranker, no Cohere)
+  reranking=True  -> pro tier   (Cohere rerank; requires COHERE_API_KEY)
+"""
+import json
+import os
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Dict, List, Sequence, Tuple
+
+# Directory containing this module; every eval artifact path is resolved from it.
+EVAL_DIR = Path(__file__).parent
+# Labeled query set; load_ground_truth() reads its top-level "queries" list.
+GROUND_TRUTH_PATH = EVAL_DIR / "ground_truth" / "queries.json"
+# Default file read by load_baseline().
+BASELINE_PATH = EVAL_DIR / "baseline.json"
+# run_eval() writes one eval_<tier>_<timestamp>.json file per run into this directory.
+RESULTS_DIR = EVAL_DIR / "results"
+# Repo used for queries that carry no "repo_id"; override with OCI_EVAL_REPO_ID.
+DEFAULT_REPO_ID = os.getenv("OCI_EVAL_REPO_ID", "78aa181e-2bbb-438b-97ee-9ffd494c4815")
+# Number of hits requested from search_v2 for every query.
+TOP_K = 10
+# Cutoffs handed to compute_metrics for recall@k / precision@k.
+K_VALUES: Sequence[int] = (5, 10)
+
+
+def load_ground_truth(path: Path = GROUND_TRUTH_PATH) -> List[dict]:
+    """Return the list stored under the top-level "queries" key of the JSON file at *path*.
+
+    The file is decoded as UTF-8 explicitly so that non-ASCII query text or file
+    paths load identically regardless of the platform's locale encoding.
+    """
+    return json.loads(path.read_text(encoding="utf-8"))["queries"]
+
+
+def load_baseline(path: Path = BASELINE_PATH) -> dict:
+    """Return the parsed JSON content of the baseline file at *path*.
+
+    The file is decoded as UTF-8 explicitly rather than with the platform's
+    locale encoding, so the result does not depend on the host configuration.
+    """
+    return json.loads(path.read_text(encoding="utf-8"))
+
+
+def _dedupe_files_by_rank(results: List[dict]) -> List[Tuple[str, float]]:
+    """Collapse function-level search hits to one entry per file.
+
+    Walks *results* in the order given and keeps only the first hit seen for each
+    non-empty "file_path", paired with that hit's "score" (0.0 when the key is
+    absent). Returns [(file_path, score)] in first-seen order.
+    """
+    first_hit_score: Dict[str, float] = {}
+    for hit in results:
+        path = hit.get("file_path") or ""
+        # Skip hits without a usable path and files already recorded at a better rank.
+        if not path or path in first_hit_score:
+            continue
+        first_hit_score[path] = float(hit.get("score", 0.0))
+    return [(path, score) for path, score in first_hit_score.items()]
+
+
+async def _preflight(indexer, repo_id: str) -> bool:
+    """Fail-closed corpus check: report whether a probe search finds anything for this repo.
+
+    Returns False when the indexer has no `index` attribute (or it is None), when
+    the one-hit probe search comes back empty, or when anything in the probe
+    raises. The probe runs with reranking off, so it never needs Cohere. Intended
+    to keep the eval from measuring a stale or empty index (known-bug #4).
+    """
+    index_handle = getattr(indexer, "index", None)
+    if index_handle is None:
+        return False
+
+    try:
+        hits = await indexer.search_v2(
+            query="function",
+            repo_id=repo_id,
+            top_k=1,
+            use_reranking=False,
+        )
+        # Measured inside the try on purpose: a non-sized return value is also a failure.
+        return len(hits) != 0
+    except Exception:
+        # Any failure means "cannot confirm the corpus" -> treat as not indexed.
+        return False
+
+
+async def run_eval(reranking: bool, repo_id_default: str = DEFAULT_REPO_ID) -> dict:
+    """Run the full query set through search_v2 and compute metrics for one tier.
+
+    Args:
+        reranking: forwarded to search_v2 as use_reranking. True is the pro tier
+            (requires COHERE_API_KEY in the environment), False is the free tier.
+        repo_id_default: repo used for any query that has no "repo_id" of its own.
+
+    Returns:
+        The result dict, which is also written to RESULTS_DIR as
+        eval_<tier>_<timestamp>.json. Keys: timestamp, tier, reranking,
+        n_queries_scored, n_empty_results, empty_query_ids, skipped_repos,
+        metrics (empty dict when no query was scored) and per_query.
+
+    Raises:
+        RuntimeError: reranking was requested but COHERE_API_KEY is not set.
+    """
+    from dependencies import indexer  # isolated import (known-bug #3)
+
+    from .metrics import compute_metrics
+
+    if reranking and not os.getenv("COHERE_API_KEY"):
+        raise RuntimeError(
+            "Pro-tier (reranked) eval requested but COHERE_API_KEY is not set. "
+            "Cohere reranking is pro-only; set the key or run free-tier (--free-only)."
+        )
+
+    queries = load_ground_truth()
+    qrels: Dict[str, Dict[str, int]] = {}
+    run: Dict[str, Dict[str, float]] = {}
+    per_query: List[dict] = []
+    empties: List[str] = []
+    skipped_repos = set()
+    # repo_id -> preflight verdict, so each repo is probed at most once per run.
+    preflight_cache: Dict[str, bool] = {}
+
+    for q in queries:
+        qid = str(q["query_id"])
+        repo_id = q.get("repo_id") or repo_id_default
+
+        if repo_id not in preflight_cache:
+            preflight_cache[repo_id] = await _preflight(indexer, repo_id)
+        if not preflight_cache[repo_id]:
+            # Skipped queries never enter qrels/run, so they do not affect the metrics.
+            skipped_repos.add(repo_id)
+            print(
+                f"[SKIP-LOUD] repo {repo_id} has no vectors in the index; skipping "
+                f"query {qid} (measuring it would corrupt recall). Index the repo first."
+            )
+            continue
+
+        expected = list(q["expected_files"])
+        # Binary relevance: every expected file gets relevance 1.
+        qrels[qid] = {fp: 1 for fp in expected}
+
+        results = await indexer.search_v2(
+            query=q["query"], repo_id=repo_id, top_k=TOP_K, use_reranking=reranking
+        )
+        if not results:
+            # FM-3: search_v2 swallows exceptions and returns []. We cannot tell a true
+            # no-hit from a swallowed error here, so flag it loudly instead of hiding it.
+            empties.append(qid)
+
+        ranked = _dedupe_files_by_rank(results)
+        ranked_paths = [fp for fp, _ in ranked]
+        # With no usable hits, a placeholder doc id is stored instead of an empty dict
+        # (presumably so the query stays in the run and is scored as a miss -- confirm
+        # against ranx's handling of empty per-query results).
+        run[qid] = {fp: score for fp, score in ranked} or {"__no_results__": 0.0}
+
+        # 1-based position of each expected file in the deduped results, None if absent.
+        expected_ranks = {
+            fp: (ranked_paths.index(fp) + 1 if fp in ranked_paths else None) for fp in expected
+        }
+        per_query.append(
+            {
+                "query_id": qid,
+                "query": q["query"],
+                "repo_id": repo_id,
+                "expected_files": expected,
+                "returned_files": ranked_paths,
+                "expected_ranks": expected_ranks,
+                "empty_result": qid in empties,
+            }
+        )
+
+    # If every query was skipped there is nothing to evaluate; report empty metrics.
+    metrics = compute_metrics(qrels, run, K_VALUES) if qrels else {}
+
+    tier = "pro_reranked" if reranking else "free_core"
+    # UTC timestamp at one-second resolution; it is also part of the output file name.
+    ts = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
+    out = {
+        "timestamp": ts,
+        "tier": tier,
+        "reranking": reranking,
+        "n_queries_scored": len(qrels),
+        "n_empty_results": len(empties),
+        "empty_query_ids": empties,
+        "skipped_repos": sorted(skipped_repos),
+        "metrics": metrics,
+        "per_query": per_query,
+    }
+    # Persist the full result (including per-query detail) next to this module.
+    RESULTS_DIR.mkdir(exist_ok=True)
+    (RESULTS_DIR / f"eval_{tier}_{ts}.json").write_text(json.dumps(out, indent=2))
+    return out
diff --git a/backend/requirements-dev.txt b/backend/requirements-dev.txt
@@ -0,0 +1,5 @@
+# Dev / eval-only dependencies. NOT installed in production.
+# Kept separate from requirements.txt so heavy eval deps never reach the backend
+# startup path (oci known-bug #3: optional imports must not crash startup).
+# Install with: pip install -r backend/requirements-dev.txt
+ranx>=0.3.20