refactor: address review feedback on eval harness types and async I/O (#312)

DevanshuNEU · DevanshuNEU · commit db87f0acb016 · 2026-06-11T00:09:04.000-04:00
- _print_table / run_eval: dict -> Dict[str, Any] (heterogeneous result dict; Any is intentional and commented, per the backend no-bare-Any rule).
- _preflight: type the indexer param via a TYPE_CHECKING guard so the precise type costs nothing at runtime (preserves known-bug #3 import isolation).
- test: add -> None return annotation.
- results write: run Path.write_text off the event loop via asyncio.to_thread so run_eval stays non-blocking (backend async rule).

Skipped the nitpick about converting load_ground_truth/load_baseline to async aiofiles: these are one-time sub-ms config reads in an offline batch CLI with no loop concurrency, so the change is not worth a new runtime dep plus cascading async into a sync test.
diff --git a/backend/evals/__main__.py b/backend/evals/__main__.py
@@ -7,11 +7,13 @@
 """
 import argparse
 import asyncio
+from typing import Any, Dict
 
 from .runner import run_eval
 
 
-def _print_table(label: str, out: dict) -> None:
+# out is the heterogeneous result dict from run_eval; Any is intentional
+def _print_table(label: str, out: Dict[str, Any]) -> None:
     print(f"\n=== {label}  (tier={out['tier']}, reranking={out['reranking']}) ===")
     print(
         f"queries scored: {out['n_queries_scored']}   "
diff --git a/backend/evals/runner.py b/backend/evals/runner.py
@@ -8,11 +8,15 @@
   reranking=False -> free tier  (deterministic BM25 + vector core ranker, no Cohere)
   reranking=True  -> pro tier   (Cohere rerank; requires COHERE_API_KEY)
 """
+import asyncio
 import json
 import os
 from datetime import datetime, timezone
 from pathlib import Path
-from typing import Dict, List, Sequence, Tuple
+from typing import TYPE_CHECKING, Any, Dict, List, Sequence, Tuple
+
+if TYPE_CHECKING:  # typing-only import; avoids the heavy runtime import (known-bug #3)
+    from services.indexer_optimized import OptimizedCodeIndexer
 
 EVAL_DIR = Path(__file__).parent
 GROUND_TRUTH_PATH = EVAL_DIR / "ground_truth" / "queries.json"
@@ -42,7 +46,7 @@ def _dedupe_files_by_rank(results: List[dict]) -> List[Tuple[str, float]]:
     return list(seen.items())
 
 
-async def _preflight(indexer, repo_id: str) -> bool:
+async def _preflight(indexer: "OptimizedCodeIndexer", repo_id: str) -> bool:
     """Fail-closed corpus check: does the index actually have vectors for this repo?
 
     Guards against measuring a stale/empty index (known-bug #4: Pinecone is eventual
@@ -58,7 +62,13 @@ async def _preflight(indexer, repo_id: str) -> bool:
         return False
 
 
-async def run_eval(reranking: bool, repo_id_default: str = DEFAULT_REPO_ID) -> dict:
+def _write_results(path: Path, payload: str) -> None:
+    path.parent.mkdir(parents=True, exist_ok=True)
+    path.write_text(payload)
+
+
+# returns a heterogeneous result dict (metrics + per-query breakdown + metadata); Any is intentional
+async def run_eval(reranking: bool, repo_id_default: str = DEFAULT_REPO_ID) -> Dict[str, Any]:
     """Run the full query set through search_v2 and compute metrics for one tier."""
     from dependencies import indexer  # isolated import (known-bug #3)
 
@@ -137,6 +147,7 @@ async def run_eval(reranking: bool, repo_id_default: str = DEFAULT_REPO_ID) -> d
         "metrics": metrics,
         "per_query": per_query,
     }
-    RESULTS_DIR.mkdir(exist_ok=True)
-    (RESULTS_DIR / f"eval_{tier}_{ts}.json").write_text(json.dumps(out, indent=2))
+    results_path = RESULTS_DIR / f"eval_{tier}_{ts}.json"
+    # write off the event loop so this async function stays non-blocking (backend rule)
+    await asyncio.to_thread(_write_results, results_path, json.dumps(out, indent=2))
     return out
diff --git a/backend/evals/test_retrieval_quality.py b/backend/evals/test_retrieval_quality.py
@@ -24,7 +24,7 @@
 )
 
 
-def test_recall_at_10_meets_baseline():
+def test_recall_at_10_meets_baseline() -> None:
     baseline = load_baseline()
     if not baseline.get("calibrated"):
         pytest.skip("baseline not calibrated yet; run `python -m evals` and record numbers first")

Original file line number	Diff line number	Diff line change
`@@ -24,7 +24,7 @@`
`24`	`24`	`)`
`25`	`25`
`26`	`26`
`27`		`-def test_recall_at_10_meets_baseline():`
	`27`	`+def test_recall_at_10_meets_baseline() -> None:`
`28`	`28`	`baseline = load_baseline()`
`29`	`29`	`if not baseline.get("calibrated"):`
`30`	`30`	``pytest.skip("baseline not calibrated yet; run `python -m evals` and record numbers first")``