Skip to content

Commit db87f0a

Browse files
committed
refactor: address review feedback on eval harness types and async I/O (#312)
- `_print_table` / `run_eval`: `dict` -> `Dict[str, Any]` (heterogeneous result dict; `Any` is intentional and commented per the backend no-bare-Any rule)
- `_preflight`: type the `indexer` param via a `TYPE_CHECKING` guard so the precise type costs nothing at runtime (preserves known-bug #3 import isolation)
- test: add `-> None` return annotation
- results write: run `Path.write_text` off the event loop via `asyncio.to_thread` so `run_eval` stays non-blocking (backend async rule)

Skipped the nitpick converting `load_ground_truth`/`load_baseline` to async aiofiles: these are one-time sub-ms config reads in an offline batch CLI with no loop concurrency; not worth a new runtime dep + cascading async into a sync test.
1 parent 1a24f31 commit db87f0a

3 files changed

Lines changed: 20 additions & 7 deletions

File tree

backend/evals/__main__.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,13 @@
77
"""
88
import argparse
99
import asyncio
10+
from typing import Any, Dict
1011

1112
from .runner import run_eval
1213

1314

14-
def _print_table(label: str, out: dict) -> None:
15+
# out is the heterogeneous result dict from run_eval; Any is intentional
16+
def _print_table(label: str, out: Dict[str, Any]) -> None:
1517
print(f"\n=== {label} (tier={out['tier']}, reranking={out['reranking']}) ===")
1618
print(
1719
f"queries scored: {out['n_queries_scored']} "

backend/evals/runner.py

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,15 @@
88
reranking=False -> free tier (deterministic BM25 + vector core ranker, no Cohere)
99
reranking=True -> pro tier (Cohere rerank; requires COHERE_API_KEY)
1010
"""
11+
import asyncio
1112
import json
1213
import os
1314
from datetime import datetime, timezone
1415
from pathlib import Path
15-
from typing import Dict, List, Sequence, Tuple
16+
from typing import TYPE_CHECKING, Any, Dict, List, Sequence, Tuple
17+
18+
if TYPE_CHECKING: # typing-only import; avoids the heavy runtime import (known-bug #3)
19+
from services.indexer_optimized import OptimizedCodeIndexer
1620

1721
EVAL_DIR = Path(__file__).parent
1822
GROUND_TRUTH_PATH = EVAL_DIR / "ground_truth" / "queries.json"
@@ -42,7 +46,7 @@ def _dedupe_files_by_rank(results: List[dict]) -> List[Tuple[str, float]]:
4246
return list(seen.items())
4347

4448

45-
async def _preflight(indexer, repo_id: str) -> bool:
49+
async def _preflight(indexer: "OptimizedCodeIndexer", repo_id: str) -> bool:
4650
"""Fail-closed corpus check: does the index actually have vectors for this repo?
4751
4852
Guards against measuring a stale/empty index (known-bug #4: Pinecone is eventual
@@ -58,7 +62,13 @@ async def _preflight(indexer, repo_id: str) -> bool:
5862
return False
5963

6064

61-
async def run_eval(reranking: bool, repo_id_default: str = DEFAULT_REPO_ID) -> dict:
65+
def _write_results(path: Path, payload: str) -> None:
66+
path.parent.mkdir(parents=True, exist_ok=True)
67+
path.write_text(payload)
68+
69+
70+
# returns a heterogeneous result dict (metrics + per-query breakdown + metadata); Any is intentional
71+
async def run_eval(reranking: bool, repo_id_default: str = DEFAULT_REPO_ID) -> Dict[str, Any]:
6272
"""Run the full query set through search_v2 and compute metrics for one tier."""
6373
from dependencies import indexer # isolated import (known-bug #3)
6474

@@ -137,6 +147,7 @@ async def run_eval(reranking: bool, repo_id_default: str = DEFAULT_REPO_ID) -> d
137147
"metrics": metrics,
138148
"per_query": per_query,
139149
}
140-
RESULTS_DIR.mkdir(exist_ok=True)
141-
(RESULTS_DIR / f"eval_{tier}_{ts}.json").write_text(json.dumps(out, indent=2))
150+
results_path = RESULTS_DIR / f"eval_{tier}_{ts}.json"
151+
# write off the event loop so this async function stays non-blocking (backend rule)
152+
await asyncio.to_thread(_write_results, results_path, json.dumps(out, indent=2))
142153
return out

backend/evals/test_retrieval_quality.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
)
2525

2626

27-
def test_recall_at_10_meets_baseline():
27+
def test_recall_at_10_meets_baseline() -> None:
2828
baseline = load_baseline()
2929
if not baseline.get("calibrated"):
3030
pytest.skip("baseline not calibrated yet; run `python -m evals` and record numbers first")

0 commit comments

Comments
 (0)