diff --git a/backend/evals/README.md b/backend/evals/README.md new file mode 100644 index 0000000..97ca290 --- /dev/null +++ b/backend/evals/README.md @@ -0,0 +1,78 @@ +# Retrieval-quality eval harness (#312) + +Offline, deterministic measurement of OCI's live `search_v2` ranker. Answers one +question with a number: **for a set of known queries, does search return the right +files, ranked high enough to matter?** Metrics: `recall@5`, `recall@10`, `precision@k`, +`MRR`. This is the internal regression instrument; it is not the public benchmark. + +## Why it exists + +You cannot improve, defend, or sell what you cannot measure. Before this, search +quality was unquantified. This harness is the speedometer: run it before and after +any change to search (a new reranker, an embedding-model swap, the v2 to v3 cutover) +and see whether the change helped or hurt. + +## Tiers (Cohere reranking is pro-only) + +Cohere costs money, so reranking is a **pro-tier** feature. The harness records both: + +- **Free tier** (`--free-only`): no reranking. The deterministic BM25 + vector core + ranker -- what most users and agents actually get. This is the CI/regression baseline. +- **Pro tier** (`--pro-only`): Cohere reranking. Requires `COHERE_API_KEY`; without it + the pro run is skipped (loudly), never silently duplicated. + +Default (`python -m evals`) runs both and prints the delta, so you can see exactly +what reranking is worth. + +## Run it + +From `backend/` (needs `OPENAI_API_KEY`, `PINECONE_API_KEY`, and a populated index): + +```bash +pip install -r requirements-dev.txt # one-time: installs ranx +export OCI_EVAL_REPO_ID=<repo-id> # optional override; leave unset to use OCI's own repo id +python -m evals # both tiers +python -m evals --free-only # free tier only (deterministic, no Cohere) +``` + +Each run prints a table and writes a timestamped JSON to `results/` (git-ignored) with +a **per-query breakdown** -- the rank of every expected file -- so a regression is +diagnosable, not just detectable. 
+ +## First run is a calibration step + +`expected_files` are repo-root-relative and must match the `file_path` strings the +index stores. If the first run shows recall near 0 across *all* queries, that is almost +certainly a path-format mismatch, not a bad ranker -- compare `expected_files` against +`returned_files` in the per-query breakdown and adjust the labels. Then set the baseline. + +## Add a query (the rule that keeps the number honest) + +Edit `ground_truth/queries.json`. Two non-negotiables: + +1. **Write it agent-shaped.** Phrase it the way an agent asks (`"where is the JWT + validated before a request reaches a route"`), not keyword-shaped (`"auth"`). +2. **Label it blind.** Decide `expected_files` by *reading the repo*, never by looking + at what search returns. Grading the test against the system's own output measures + self-agreement, not correctness (FM-2 in the ADR). + +## Calibrate the regression gate + +`backend/evals/test_retrieval_quality.py` is a pytest gate that asserts +`recall@10 >= baseline - tolerance`. It lives here (not in `backend/tests/`) because +that suite globally mocks Pinecone + OpenAI, which would force recall to 0. Run it with +`pytest evals/ -v`. It skips until `baseline.json` has `calibrated: true`: + +1. Run `python -m evals`, confirm the numbers are real (calibration above). +2. Put the free-tier `recall@10` and `mrr` into `baseline.json`, set `calibrated: true`. +3. From then on, `pytest evals/ -v` fails any change that regresses recall beyond tolerance. + +## Known limitations (v0.1) + +- **Local, not network-free CI.** Runs against the live index, so it needs real creds. + Committing query embeddings + an index snapshot for network-free CI is a follow-up. +- **Pre-flight checks "repo has vectors," not SHA-equality.** `repo_sha` is recorded as + documented intent; strict snapshot-pinning is a follow-up. 
# out is the heterogeneous result dict from run_eval; Any is intentional
def _print_table(label: str, out: Dict[str, Any]) -> None:
    """Print a human-readable summary of one tier's eval result.

    label: heading shown in the banner line.
    out:   result dict returned by run_eval. Reads the keys tier, reranking,
           n_queries_scored, n_empty_results, skipped_repos, metrics and
           empty_query_ids.
    """
    print(f"\n=== {label} (tier={out['tier']}, reranking={out['reranking']}) ===")
    print(
        f"queries scored: {out['n_queries_scored']} "
        f"empty/ambiguous: {out['n_empty_results']} "
        f"skipped repos: {len(out['skipped_repos'])}"
    )
    if not out["metrics"]:
        # run_eval leaves metrics empty when every query was skipped.
        print(" no metrics: no queries scored (index not populated?). See SKIP-LOUD lines above.")
        return
    for name, val in out["metrics"].items():
        print(f" {name:<14} {val:.4f}")
    if out["empty_query_ids"]:
        print(
            f" NOTE: empty results for queries {out['empty_query_ids']} "
            f"(counted as misses; investigate index/errors before trusting these numbers)."
        )


async def _run_both_tiers() -> None:
    """Run the free tier, then the pro tier, and print both tables plus the delta.

    Both tiers are awaited on the SAME event loop. asyncio.run() creates and
    closes a fresh loop on every call, so calling it once per tier would hand
    the second run a new loop while reusing whatever state the first run left
    on the shared indexer (NOTE(review): whether the indexer holds loop-bound
    clients is not visible from this module; a single loop is safe either way).
    """
    free = await run_eval(reranking=False)
    _print_table("FREE TIER (core ranker)", free)

    # Keep the try body to the one call that signals "pro tier unavailable", so
    # a RuntimeError raised while printing is not misreported as a skip.
    try:
        pro = await run_eval(reranking=True)
    except RuntimeError as e:
        print(f"\n[PRO TIER SKIPPED] {e}")
        return

    _print_table("PRO TIER (Cohere rerank)", pro)
    f10, p10 = free["metrics"].get("recall@10"), pro["metrics"].get("recall@10")
    if f10 is not None and p10 is not None:
        print(f"\nReranking delta recall@10: {p10 - f10:+.4f} (what the pro tier buys)")


def main() -> None:
    """Parse the CLI flags and run the requested tier(s).

    --free-only and --pro-only are mutually exclusive; with neither flag both
    tiers run and the recall@10 delta is printed.
    """
    ap = argparse.ArgumentParser(prog="python -m evals", description="OCI retrieval-quality eval (#312)")
    g = ap.add_mutually_exclusive_group()
    g.add_argument("--free-only", action="store_true", help="free-tier only (no reranking)")
    g.add_argument("--pro-only", action="store_true", help="pro-tier only (Cohere reranking)")
    args = ap.parse_args()

    if args.free_only:
        _print_table("FREE TIER", asyncio.run(run_eval(reranking=False)))
        return
    if args.pro_only:
        _print_table("PRO TIER", asyncio.run(run_eval(reranking=True)))
        return

    # default: record both numbers so we can see exactly what reranking is worth
    asyncio.run(_run_both_tiers())


if __name__ == "__main__":
    main()
Until calibrated, the pytest gate skips rather than asserting against a fake baseline." +} diff --git a/backend/evals/ground_truth/queries.json b/backend/evals/ground_truth/queries.json new file mode 100644 index 0000000..4971f7f --- /dev/null +++ b/backend/evals/ground_truth/queries.json @@ -0,0 +1,65 @@ +{ + "_about": "Human-labeled retrieval ground truth for OCI #312. Queries are agent-shaped (the way Claude Code phrases a search_code call), NOT keyword-shaped. expected_files are labeled BLIND -- decided by reading the repo source, never by looking at what search returns (see FM-2 in the ADR). Paths are repo-root-relative and must match the file_path strings the index stores; the first run's per-query breakdown is the calibration step. repo_id defaults to OCI's own indexed repo (env OCI_EVAL_REPO_ID overrides per run).", + "queries": [ + { + "query_id": "q01", + "query": "where is the JWT validated before a request reaches a route", + "expected_files": ["backend/middleware/auth.py"], + "notes": "auth middleware; JWT local decode + API-key fallback" + }, + { + "query_id": "q02", + "query": "how does hybrid search combine BM25 keyword scores with semantic vector scores", + "expected_files": ["backend/services/search_v2/hybrid_searcher.py"], + "notes": "RRF fusion of BM25 + semantic candidates" + }, + { + "query_id": "q03", + "query": "where is Cohere reranking applied to the search candidates", + "expected_files": ["backend/services/search_v2/hybrid_searcher.py"], + "notes": "rerank step gated on COHERE_API_KEY (pro tier)" + }, + { + "query_id": "q04", + "query": "how are code embeddings created in batches for indexing", + "expected_files": ["backend/services/indexer_optimized.py"], + "notes": "_create_embeddings_batch, EMBEDDING_BATCH_SIZE" + }, + { + "query_id": "q05", + "query": "which API route handles the version 2 search request", + "expected_files": ["backend/routes/search_v2.py"], + "notes": "POST /search/v2 route; the path the MCP search_code tool hits" + 
}, + { + "query_id": "q06", + "query": "how is repository cloning made durable and recovered when an indexing job gets stuck", + "expected_files": ["backend/services/repo_manager.py", "backend/services/supabase_service.py"], + "notes": "ensure_clone chokepoint + reset_stuck_indexing_jobs (PR #316)" + }, + { + "query_id": "q07", + "query": "where are import dependency graphs extracted from the parsed AST", + "expected_files": ["backend/services/dependency_analyzer.py"], + "notes": "tree-sitter import graph extraction, singleton service" + }, + { + "query_id": "q08", + "query": "how does the MCP server forward a search_code tool call to the backend API", + "expected_files": ["mcp-server/handlers.py"], + "notes": "_handle_search posts to /search/v2" + }, + { + "query_id": "q09", + "query": "where is the codebase DNA and architectural pattern detection implemented", + "expected_files": ["backend/services/dna_extractor.py"], + "notes": "DNAExtractor.extract, team-rules detection" + }, + { + "query_id": "q10", + "query": "how does file path filtering avoid matching adjacent sibling directories", + "expected_files": ["backend/services/indexer_optimized.py"], + "notes": "_discover_code_files include_paths; Path.parts not str.startswith (known-bug #5)" + } + ] +} diff --git a/backend/evals/metrics.py b/backend/evals/metrics.py new file mode 100644 index 0000000..6e5c0eb --- /dev/null +++ b/backend/evals/metrics.py @@ -0,0 +1,39 @@ +"""Retrieval metrics, computed by ranx. + +ranx is imported lazily inside the function: it is a heavy optional dep and must +never be on the backend startup import path (oci known-bug #3). We buy the metric +math instead of hand-rolling it -- recall@k / MRR are a classic subtle-bug factory, +and a buggy metric makes the eval confidently wrong (worse than no eval). 
def compute_metrics(
    qrels: Dict[str, Dict[str, int]],
    run: Dict[str, Dict[str, float]],
    k_values: Sequence[int] = (5, 10),
) -> Dict[str, float]:
    """Compute recall@k and precision@k for each k, plus MRR (over the returned set).

    qrels:    {query_id: {doc_id: relevance>=1}} -- the human-labeled answer key.
    run:      {query_id: {doc_id: score}} -- what search returned.
    k_values: cutoffs to report; one recall@k and one precision@k per entry.

    Returns {metric_name: value}. Returns {} when qrels is empty: there is
    nothing to score, and an empty dict is the same "no metrics" shape the
    runner reports when every query was skipped.

    Raises ModuleNotFoundError (with install instructions) if ranx is missing.
    """
    if not qrels:
        # Nothing labeled -> nothing to measure; do not hand ranx an empty key.
        return {}

    # Lazy import: ranx is a heavy optional dep and must stay off the backend
    # startup import path (oci known-bug #3).
    try:
        from ranx import Qrels, Run, evaluate
    except ModuleNotFoundError as e:
        raise ModuleNotFoundError(
            "ranx is required for eval metrics. Install dev deps: "
            "pip install -r backend/requirements-dev.txt"
        ) from e

    metric_names = [f"{metric}@{k}" for k in k_values for metric in ("recall", "precision")]
    metric_names.append("mrr")  # results are already capped at top_k, so this is MRR@top_k

    scores = evaluate(Qrels(qrels), Run(run), metric_names)
    # ranx returns a bare float when a single metric is requested; a dict otherwise.
    if isinstance(scores, (int, float)):
        scores = {metric_names[0]: scores}
    return {m: float(scores[m]) for m in metric_names}
EVAL_DIR = Path(__file__).parent
GROUND_TRUTH_PATH = EVAL_DIR / "ground_truth" / "queries.json"
BASELINE_PATH = EVAL_DIR / "baseline.json"
RESULTS_DIR = EVAL_DIR / "results"
# `or` rather than a getenv default: os.getenv returns "" for a variable that is
# set but empty (e.g. `export OCI_EVAL_REPO_ID=`), which would silently replace
# the built-in repo id with an empty string.
DEFAULT_REPO_ID = os.getenv("OCI_EVAL_REPO_ID") or "78aa181e-2bbb-438b-97ee-9ffd494c4815"
TOP_K = 10
K_VALUES: Sequence[int] = (5, 10)


def load_ground_truth(path: Path = GROUND_TRUTH_PATH) -> List[dict]:
    """Return the list under the "queries" key of the ground-truth JSON file."""
    # Explicit UTF-8: the default text encoding is locale-dependent.
    return json.loads(path.read_text(encoding="utf-8"))["queries"]


def load_baseline(path: Path = BASELINE_PATH) -> dict:
    """Return the parsed baseline JSON file as a dict."""
    return json.loads(path.read_text(encoding="utf-8"))


def _dedupe_files_by_rank(results: List[dict]) -> List[Tuple[str, float]]:
    """search_v2 returns function-level hits; collapse to file-level, keeping the
    best (first) rank per file. Returns [(file_path, score)] in rank order.

    Hits without a file_path are dropped; a missing or null score becomes 0.0.
    """
    seen: Dict[str, float] = {}
    for r in results:
        fp = r.get("file_path") or ""
        if fp and fp not in seen:
            # `or 0.0` also covers an explicit null score, which float() rejects.
            seen[fp] = float(r.get("score") or 0.0)
    return list(seen.items())


async def _preflight(indexer: "OptimizedCodeIndexer", repo_id: str) -> bool:
    """Fail-closed corpus check: does the index actually have vectors for this repo?

    Guards against measuring a stale/empty index (known-bug #4: Pinecone is eventual
    and may be unprovisioned). Reranking off so the probe is cheap and never needs
    Cohere. A missing index or a swallowed search error both surface as 0 hits -> skip.

    Returns True only when a one-result probe query comes back non-empty.
    """
    if getattr(indexer, "index", None) is None:
        return False
    try:
        probe = await indexer.search_v2(query="function", repo_id=repo_id, top_k=1, use_reranking=False)
    except Exception as e:
        # Still fail closed, but say why: a raised probe is a different problem
        # from an empty index and should not look identical in the log.
        print(f"[SKIP-LOUD] preflight probe for repo {repo_id} raised {e!r}")
        return False
    return len(probe) > 0


def _write_results(path: Path, payload: str) -> None:
    """Write payload to path as UTF-8, creating parent directories as needed."""
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(payload, encoding="utf-8")


# returns a heterogeneous result dict (metrics + per-query breakdown + metadata); Any is intentional
async def run_eval(reranking: bool, repo_id_default: str = DEFAULT_REPO_ID) -> Dict[str, Any]:
    """Run the full query set through search_v2 and compute metrics for one tier.

    reranking:       False -> free tier; True -> pro tier (needs COHERE_API_KEY).
    repo_id_default: repo id used for queries that carry no "repo_id" of their own.

    Returns a dict with timestamp, tier, reranking, n_queries_scored,
    n_empty_results, empty_query_ids, skipped_repos, metrics and per_query,
    and writes the same dict as JSON under RESULTS_DIR.

    Raises RuntimeError if reranking is requested without COHERE_API_KEY.
    """
    # Validate cheaply before paying for the indexer import below.
    if reranking and not os.getenv("COHERE_API_KEY"):
        raise RuntimeError(
            "Pro-tier (reranked) eval requested but COHERE_API_KEY is not set. "
            "Cohere reranking is pro-only; set the key or run free-tier (--free-only)."
        )

    from dependencies import indexer  # isolated import (known-bug #3)

    from .metrics import compute_metrics

    queries = load_ground_truth()
    qrels: Dict[str, Dict[str, int]] = {}
    run: Dict[str, Dict[str, float]] = {}
    per_query: List[dict] = []
    empties: List[str] = []
    skipped_repos = set()
    preflight_cache: Dict[str, bool] = {}

    for q in queries:
        qid = str(q["query_id"])
        repo_id = q.get("repo_id") or repo_id_default

        # Probe each repo once; every query against an unindexed repo is skipped.
        if repo_id not in preflight_cache:
            preflight_cache[repo_id] = await _preflight(indexer, repo_id)
        if not preflight_cache[repo_id]:
            skipped_repos.add(repo_id)
            print(
                f"[SKIP-LOUD] repo {repo_id} has no vectors in the index; skipping "
                f"query {qid} (measuring it would corrupt recall). Index the repo first."
            )
            continue

        expected = list(q["expected_files"])
        qrels[qid] = {fp: 1 for fp in expected}

        results = await indexer.search_v2(
            query=q["query"], repo_id=repo_id, top_k=TOP_K, use_reranking=reranking
        )
        is_empty = not results
        if is_empty:
            # FM-3: search_v2 swallows exceptions and returns []. We cannot tell a true
            # no-hit from a swallowed error here, so flag it loudly instead of hiding it.
            empties.append(qid)

        ranked = _dedupe_files_by_rank(results or [])
        ranked_paths = [fp for fp, _ in ranked]
        rank_of = {fp: pos for pos, fp in enumerate(ranked_paths, start=1)}
        # ranx orders each query's documents by score, not by dict order. Raw hit
        # scores may be missing (-> 0.0 ties) or not monotonic in the returned
        # order, so score by position instead: the metrics then measure exactly
        # the ranking recorded in expected_ranks below.
        run[qid] = {fp: 1.0 / pos for fp, pos in rank_of.items()} or {"__no_results__": 0.0}

        per_query.append(
            {
                "query_id": qid,
                "query": q["query"],
                "repo_id": repo_id,
                "expected_files": expected,
                "returned_files": ranked_paths,
                "expected_ranks": {fp: rank_of.get(fp) for fp in expected},
                "empty_result": is_empty,
            }
        )

    metrics = compute_metrics(qrels, run, K_VALUES) if qrels else {}

    tier = "pro_reranked" if reranking else "free_core"
    ts = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
    out = {
        "timestamp": ts,
        "tier": tier,
        "reranking": reranking,
        "n_queries_scored": len(qrels),
        "n_empty_results": len(empties),
        "empty_query_ids": empties,
        "skipped_repos": sorted(skipped_repos),
        "metrics": metrics,
        "per_query": per_query,
    }
    results_path = RESULTS_DIR / f"eval_{tier}_{ts}.json"
    # write off the event loop so this async function stays non-blocking (backend rule)
    await asyncio.to_thread(_write_results, results_path, json.dumps(out, indent=2))
    return out
_REQUIRED_ENV = ("OPENAI_API_KEY", "PINECONE_API_KEY")

# Module-wide skip: without real credentials the eval cannot reach the live index.
pytestmark = pytest.mark.skipif(
    not all(os.getenv(k) for k in _REQUIRED_ENV),
    reason="retrieval eval needs real OPENAI_API_KEY + PINECONE_API_KEY + a populated index",
)


def test_recall_at_10_meets_baseline() -> None:
    """Fail when free-tier recall@10 drops more than `tolerance` below the baseline.

    Skips while baseline.json is uncalibrated. Once calibrated, a baseline
    without a numeric free_core recall@10, or a run that scored no queries,
    fails with a message naming that cause rather than an opaque TypeError or
    a misleading "recall 0.0000" comparison.
    """
    baseline = load_baseline()
    if not baseline.get("calibrated"):
        pytest.skip("baseline not calibrated yet; run `python -m evals` and record numbers first")

    recorded = (baseline.get("free_core") or {}).get("recall@10")
    assert isinstance(recorded, (int, float)), (
        "baseline.json has calibrated=true but free_core.recall@10 is not a number; "
        "record the free-tier recall@10 from a real `python -m evals` run"
    )
    tol = baseline.get("tolerance")
    if tol is None:
        tol = 0.05  # same default the gate has always used when tolerance is absent

    out = asyncio.run(run_eval(reranking=False))
    # Empty metrics means run_eval scored nothing (every repo failed preflight);
    # that is an environment problem, not a ranking regression.
    assert out["metrics"], (
        f"no queries were scored (skipped repos: {out.get('skipped_repos')}); "
        "the index is not populated for the eval repo"
    )

    floor = recorded - tol
    actual = out["metrics"].get("recall@10", 0.0)
    assert actual >= floor, f"recall@10 {actual:.4f} below baseline floor {floor:.4f}"