From 94872fde487b887a9a873f4ba6106f6540890a6b Mon Sep 17 00:00:00 2001
From: Thor Whalen <1906276+thorwhalen@users.noreply.github.com>
Date: Sat, 6 Jun 2026 13:02:03 +0200
Subject: [PATCH 1/2] perf(retrieve): cache BM25 index per corpus (was rebuilt
 per query)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

_lexical_ranked built the candidate collection and recomputed all BM25
term statistics on every query, making lexical/hybrid O(corpus*queries)
— they did not scale (a full lexical/hybrid pass over the 16,836-record
reports corpus did not finish in 10 min CPU; ir_05 §6.1). Now a
vd.BM25Index is built once per candidate set and cached on the corpus
instance (keyed by surfaces+filter); the index's query-independent
statistics are reused across queries. Behavior-preserving: skills strict
reproduces dense 0.6585 / lexical 0.6407 / hybrid 0.6941 exactly.

Needs vd.BM25Index (i2mint/vd#22).

Closes #21. Refs #12
---
 ir/retrieve.py | 94 +++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 74 insertions(+), 20 deletions(-)

diff --git a/ir/retrieve.py b/ir/retrieve.py
index bb3099a..f23d72a 100644
--- a/ir/retrieve.py
+++ b/ir/retrieve.py
@@ -26,6 +26,7 @@
 
 from __future__ import annotations
 
+import json
 import warnings
 from collections.abc import Iterable, Mapping
 from types import SimpleNamespace
@@ -100,32 +101,57 @@ def _dense_ranked(
     return [(ids[candidates[o]], float(scores[o])) for o in order]
 
 
-def _lexical_ranked(
-    ids: list[str],
-    metas: list[dict],
-    candidates: list[int],
-    query: str,
-    fetch: int,
-    bm25: Mapping[str, Any] | None,
-) -> list[tuple[str, float]]:
-    """Top-*fetch* ``(record_id, bm25)`` pairs via ``vd.bm25_lexical_search``.
+def _candidate_key(
+    surfaces: tuple[str, ...] | None, filter: Mapping[str, Any] | None
+) -> tuple:
+    """A hashable key identifying a candidate set (surface restriction + filter).
+
+    For a fixed :class:`~ir.index.Corpus` instance the candidate set is fully
+    determined by the surface restriction and the hard metadata filter, so two
+    searches sharing both can share one BM25 index. The filter is canonicalized
+    with sorted keys so logically-equal filters land on the same cache entry.
+    """
+    f = json.dumps(filter, sort_keys=True, default=str) if filter is not None else None
+    return (surfaces, f)
+
+
+def _bm25_index_for(corpus, ids, metas, candidates, cache_key):
+    """A ``vd.BM25Index`` over the candidate texts, cached on the corpus instance.
+
+    BM25's term statistics (document frequencies, lengths) are
+    *query-independent*, so the index is built once per candidate set and reused
+    across queries. Rebuilding it on every query — re-tokenizing every document
+    each time — is the lexical/hybrid bottleneck for batch evaluation and the
+    reason large corpora did not scale. The cache lives on the corpus instance,
+    which is immutable during its lifetime and freshly created whenever the
+    corpus is (re)built or reopened, so the cache never goes stale.
 
     The candidate texts are exposed to ``vd`` as a zero-copy mapping view
     (``record_id -> obj`` with ``.text`` / ``.metadata``) so no vectors are
-    duplicated and BM25 runs only over the already-filtered candidates. Returns
-    ``[]`` (with a warning) if ``vd``'s lexical search is unavailable, letting
-    hybrid degrade to dense.
+    duplicated and BM25 covers only the already-filtered candidates. Returns
+    ``None`` (with a warning) if ``vd`` is unavailable, letting lexical mode
+    return nothing and hybrid degrade to dense.
     """
     try:
-        from vd import bm25_lexical_search
+        from vd import BM25Index
     except Exception:
         warnings.warn(
-            "vd.bm25_lexical_search unavailable; BM25 lexical ranking is "
-            "skipped. Hybrid falls back to dense; lexical mode returns no "
-            "results. Install `vd` for lexical/hybrid retrieval.",
+            "vd.BM25Index unavailable; BM25 lexical ranking is skipped. Hybrid "
+            "falls back to dense; lexical mode returns no results. Install `vd` "
+            "for lexical/hybrid retrieval.",
             stacklevel=3,
         )
-        return []
+        return None
+
+    cache = getattr(corpus, "_bm25_index_cache", None)
+    if cache is None:
+        cache = {}
+        try:
+            corpus._bm25_index_cache = cache
+        except Exception:
+            pass  # corpus doesn't admit attribute caching → build fresh each call
+    if cache_key in cache:
+        return cache[cache_key]
 
     collection = {
         ids[j]: SimpleNamespace(
@@ -133,7 +159,25 @@ def _lexical_ranked(
         )
         for j in candidates
     }
-    results = bm25_lexical_search(collection, query, limit=fetch, **(bm25 or {}))
+    index = BM25Index(collection)
+    cache[cache_key] = index
+    return index
+
+
+def _lexical_ranked(
+    index,
+    query: str,
+    fetch: int,
+    bm25: Mapping[str, Any] | None,
+) -> list[tuple[str, float]]:
+    """Top-*fetch* ``(record_id, bm25_score)`` pairs from a prebuilt BM25 index.
+
+    ``index`` is a ``vd.BM25Index`` (see :func:`_bm25_index_for`) or ``None``
+    when ``vd`` is unavailable, in which case lexical ranking is empty.
+    """
+    if index is None:
+        return []
+    results = index.search(query, limit=fetch, **(bm25 or {}))
     return [(r["id"], float(r["score"])) for r in results]
 
 
@@ -233,6 +277,10 @@ def search(
     if mode not in MODES:
         raise ValueError(f"unknown mode {mode!r}; expected one of {MODES}")
 
+    # Materialize once: surfaces may be a one-shot iterable, and it is both the
+    # filter input and part of the BM25 cache key.
+    surfaces = tuple(surfaces) if surfaces is not None else None
+
     ids, mat, metas = corpus.store.matrix()
     if not ids:
         return []
@@ -244,13 +292,19 @@ def search(
     fetch = fetch_k if fetch_k is not None else (max(k * 5, 50) if per_artifact else k)
 
     if mode == "lexical":
-        ranked = _lexical_ranked(ids, metas, candidates, query, fetch, bm25)
+        index = _bm25_index_for(
+            corpus, ids, metas, candidates, _candidate_key(surfaces, filter)
+        )
+        ranked = _lexical_ranked(index, query, fetch, bm25)
     else:
         dense = _dense_ranked(corpus, mat, ids, candidates, query, fetch)
         if mode == "dense":
             ranked = dense
         else:  # hybrid
-            lexical = _lexical_ranked(ids, metas, candidates, query, fetch, bm25)
+            index = _bm25_index_for(
+                corpus, ids, metas, candidates, _candidate_key(surfaces, filter)
+            )
+            lexical = _lexical_ranked(index, query, fetch, bm25)
             ranked = _rrf_fuse(dense, lexical, rrf_k, fetch)
 
     meta_by_id = {ids[j]: metas[j] for j in candidates}

From 5f239f554736aa0b1f411b6ea7a6f3ee6f864a42 Mon Sep 17 00:00:00 2001
From: Thor Whalen <1906276+thorwhalen@users.noreply.github.com>
Date: Sat, 6 Jun 2026 13:05:27 +0200
Subject: [PATCH 2/2] =?UTF-8?q?docs(eval):=20note=20BM25=20caching=20fix?=
 =?UTF-8?q?=20in=20ir=5F05=20(=C2=A76.1)=20+=20full=20reports=20n=3D600?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Records that the per-query BM25 rebuild (ir_05 §6.1) is resolved, and
upgrades the reports row to the full 600-gold run (now feasible): hybrid
0.537 > dense 0.500 > lexical 0.401. Refs #21 #12
---
 ... Run -- Findings (Dense vs Lexical vs Hybrid).md | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/misc/docs/ir_05 -- Capability-Discovery Eval Run -- Findings (Dense vs Lexical vs Hybrid).md b/misc/docs/ir_05 -- Capability-Discovery Eval Run -- Findings (Dense vs Lexical vs Hybrid).md
index bc755bd..239fe4d 100644
--- a/misc/docs/ir_05 -- Capability-Discovery Eval Run -- Findings (Dense vs Lexical vs Hybrid).md	
+++ b/misc/docs/ir_05 -- Capability-Discovery Eval Run -- Findings (Dense vs Lexical vs Hybrid).md	
@@ -66,8 +66,9 @@ All corpora were built with the production `all-MiniLM-L6-v2` embedder.
 
 > **Sampling caveat.** packages/reports were capped at 120 artifacts (`--max-artifacts`,
 > sorted-id order for skills/packages; the reports sample spans many top-level path
-> prefixes, so it is not degenerate). The reports *scoring* was further restricted
-> to a random 100-gold sample because of the BM25 scaling issue (§6.1).
+> prefixes, so it is not degenerate). The reports figures above are the full
+> 600-gold run (≈3 min after the BM25 caching fix, §6.1); the original run was
+> restricted to 100 gold because lexical/hybrid did not scale before that fix.
 
 ---
 
@@ -80,7 +81,7 @@ All corpora were built with the production `all-MiniLM-L6-v2` embedder.
 | **skills** — judge-clean | 704 | 0.728 | 0.709 | **0.772** | hybrid |
 | **skills** — judge-graded | 719 | 0.733 | 0.684 | **0.765** | hybrid |
 | **packages** — strict | 582 | **0.583** | 0.305 | 0.489 | dense |
-| **reports** — strict (n=100) | 100 | 0.431 | 0.367 | **0.477** | hybrid |
+| **reports** — strict | 600 | 0.500 | 0.401 | **0.537** | hybrid |
 
 "Lenses" are four increasingly-fair definitions of the gold set for skills (§4).
 The mode **ordering is identical across all four skills lenses** — the result is
@@ -172,6 +173,12 @@ and down-sampled to 100 queries. **Opportunity (ir/vd):** build/persist the BM25
 index once per corpus (alongside the dense matrix) and reuse it across queries —
 this is the single biggest blocker to running `ir` at corpus scale.
 
+> **Update — fixed.** Resolved in ir #21 (`ir.retrieve` caches a per-corpus
+> `vd.BM25Index`, keyed by surfaces+filter) on top of vd #22/#23 (`vd.BM25Index`,
+> build-once / query-many). The full 600-query reports run now completes in
+> ~3 min (all three modes, vs >10 min killed before); skills strict reproduces
+> the pre-fix numbers exactly, so the change is behavior-preserving.
+
 ### 6.2 Near-duplicate artifacts in the skills corpus
 27% of skills are indexed twice (global + package-scoped) with near-identical
 descriptions. This silently deflates single-gold scores and adds noise to any A/B.