From 94872fde487b887a9a873f4ba6106f6540890a6b Mon Sep 17 00:00:00 2001 From: Thor Whalen <1906276+thorwhalen@users.noreply.github.com> Date: Sat, 6 Jun 2026 13:02:03 +0200 Subject: [PATCH 1/2] perf(retrieve): cache BM25 index per corpus (was rebuilt per query) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit _lexical_ranked built the candidate collection and recomputed all BM25 term statistics on every query, making lexical/hybrid O(corpus*queries) — they did not scale (a full lexical/hybrid pass over the 16,836-record reports corpus did not finish in 10 min CPU; ir_05 §6.1). Now a vd.BM25Index is built once per candidate set and cached on the corpus instance (keyed by surfaces+filter); the index's query-independent statistics are reused across queries. Behavior-preserving: skills strict reproduces dense 0.6585 / lexical 0.6407 / hybrid 0.6941 exactly. Needs vd.BM25Index (i2mint/vd#22). Closes #21. Refs #12 --- ir/retrieve.py | 94 +++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 74 insertions(+), 20 deletions(-) diff --git a/ir/retrieve.py b/ir/retrieve.py index bb3099a..f23d72a 100644 --- a/ir/retrieve.py +++ b/ir/retrieve.py @@ -26,6 +26,7 @@ from __future__ import annotations +import json import warnings from collections.abc import Iterable, Mapping from types import SimpleNamespace @@ -100,32 +101,57 @@ def _dense_ranked( return [(ids[candidates[o]], float(scores[o])) for o in order] -def _lexical_ranked( - ids: list[str], - metas: list[dict], - candidates: list[int], - query: str, - fetch: int, - bm25: Mapping[str, Any] | None, -) -> list[tuple[str, float]]: - """Top-*fetch* ``(record_id, bm25)`` pairs via ``vd.bm25_lexical_search``. +def _candidate_key( + surfaces: tuple[str, ...] | None, filter: Mapping[str, Any] | None +) -> tuple: + """A hashable key identifying a candidate set (surface restriction + filter). 
+ + For a fixed :class:`~ir.index.Corpus` instance the candidate set is fully + determined by the surface restriction and the hard metadata filter, so two + searches sharing both can share one BM25 index. The filter is canonicalized + with sorted keys so logically-equal filters land on the same cache entry. + """ + f = json.dumps(filter, sort_keys=True, default=str) if filter is not None else None + return (surfaces, f) + + +def _bm25_index_for(corpus, ids, metas, candidates, cache_key): + """A ``vd.BM25Index`` over the candidate texts, cached on the corpus instance. + + BM25's term statistics (document frequencies, lengths) are + *query-independent*, so the index is built once per candidate set and reused + across queries. Rebuilding it on every query — re-tokenizing every document + each time — is the lexical/hybrid bottleneck for batch evaluation and the + reason large corpora did not scale. The cache lives on the corpus instance, + which is immutable during its lifetime and freshly created whenever the + corpus is (re)built or reopened, so the cache never goes stale. The candidate texts are exposed to ``vd`` as a zero-copy mapping view (``record_id -> obj`` with ``.text`` / ``.metadata``) so no vectors are - duplicated and BM25 runs only over the already-filtered candidates. Returns - ``[]`` (with a warning) if ``vd``'s lexical search is unavailable, letting - hybrid degrade to dense. + duplicated and BM25 covers only the already-filtered candidates. Returns + ``None`` (with a warning) if ``vd`` is unavailable, letting lexical mode + return nothing and hybrid degrade to dense. """ try: - from vd import bm25_lexical_search + from vd import BM25Index except Exception: warnings.warn( - "vd.bm25_lexical_search unavailable; BM25 lexical ranking is " - "skipped. Hybrid falls back to dense; lexical mode returns no " - "results. Install `vd` for lexical/hybrid retrieval.", + "vd.BM25Index unavailable; BM25 lexical ranking is skipped. 
Hybrid " + "falls back to dense; lexical mode returns no results. Install `vd` " + "for lexical/hybrid retrieval.", stacklevel=3, ) - return [] + return None + + cache = getattr(corpus, "_bm25_index_cache", None) + if cache is None: + cache = {} + try: + corpus._bm25_index_cache = cache + except Exception: + pass # corpus doesn't admit attribute caching → build fresh each call + if cache_key in cache: + return cache[cache_key] collection = { ids[j]: SimpleNamespace( @@ -133,7 +159,25 @@ def _lexical_ranked( ) for j in candidates } - results = bm25_lexical_search(collection, query, limit=fetch, **(bm25 or {})) + index = BM25Index(collection) + cache[cache_key] = index + return index + + +def _lexical_ranked( + index, + query: str, + fetch: int, + bm25: Mapping[str, Any] | None, +) -> list[tuple[str, float]]: + """Top-*fetch* ``(record_id, bm25_score)`` pairs from a prebuilt BM25 index. + + ``index`` is a ``vd.BM25Index`` (see :func:`_bm25_index_for`) or ``None`` + when ``vd`` is unavailable, in which case lexical ranking is empty. + """ + if index is None: + return [] + results = index.search(query, limit=fetch, **(bm25 or {})) return [(r["id"], float(r["score"])) for r in results] @@ -233,6 +277,10 @@ def search( if mode not in MODES: raise ValueError(f"unknown mode {mode!r}; expected one of {MODES}") + # Materialize once: surfaces may be a one-shot iterable, and it is both the + # filter input and part of the BM25 cache key. 
+ surfaces = tuple(surfaces) if surfaces is not None else None + ids, mat, metas = corpus.store.matrix() if not ids: return [] @@ -244,13 +292,19 @@ def search( fetch = fetch_k if fetch_k is not None else (max(k * 5, 50) if per_artifact else k) if mode == "lexical": - ranked = _lexical_ranked(ids, metas, candidates, query, fetch, bm25) + index = _bm25_index_for( + corpus, ids, metas, candidates, _candidate_key(surfaces, filter) + ) + ranked = _lexical_ranked(index, query, fetch, bm25) else: dense = _dense_ranked(corpus, mat, ids, candidates, query, fetch) if mode == "dense": ranked = dense else: # hybrid - lexical = _lexical_ranked(ids, metas, candidates, query, fetch, bm25) + index = _bm25_index_for( + corpus, ids, metas, candidates, _candidate_key(surfaces, filter) + ) + lexical = _lexical_ranked(index, query, fetch, bm25) ranked = _rrf_fuse(dense, lexical, rrf_k, fetch) meta_by_id = {ids[j]: metas[j] for j in candidates} From 5f239f554736aa0b1f411b6ea7a6f3ee6f864a42 Mon Sep 17 00:00:00 2001 From: Thor Whalen <1906276+thorwhalen@users.noreply.github.com> Date: Sat, 6 Jun 2026 13:05:27 +0200 Subject: [PATCH 2/2] =?UTF-8?q?docs(eval):=20note=20BM25=20caching=20fix?= =?UTF-8?q?=20in=20ir=5F05=20(=C2=A76.1)=20+=20full=20reports=20n=3D600?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Records that the per-query BM25 rebuild (ir_05 §6.1) is resolved, and upgrades the reports row to the full 600-gold run (now feasible): hybrid 0.537 > dense 0.500 > lexical 0.401. Refs #21 #12 --- ... 
Run -- Findings (Dense vs Lexical vs Hybrid).md | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/misc/docs/ir_05 -- Capability-Discovery Eval Run -- Findings (Dense vs Lexical vs Hybrid).md b/misc/docs/ir_05 -- Capability-Discovery Eval Run -- Findings (Dense vs Lexical vs Hybrid).md index bc755bd..239fe4d 100644 --- a/misc/docs/ir_05 -- Capability-Discovery Eval Run -- Findings (Dense vs Lexical vs Hybrid).md +++ b/misc/docs/ir_05 -- Capability-Discovery Eval Run -- Findings (Dense vs Lexical vs Hybrid).md @@ -66,8 +66,9 @@ All corpora were built with the production `all-MiniLM-L6-v2` embedder. > **Sampling caveat.** packages/reports were capped at 120 artifacts (`--max-artifacts`, > sorted-id order for skills/packages; the reports sample spans many top-level path -> prefixes, so it is not degenerate). The reports *scoring* was further restricted -> to a random 100-gold sample because of the BM25 scaling issue (§6.1). +> prefixes, so it is not degenerate). The reports figures above are the full +> 600-gold run (≈3 min after the BM25 caching fix, §6.1); the original run was +> restricted to 100 gold because lexical/hybrid did not scale before that fix. --- @@ -80,7 +81,7 @@ All corpora were built with the production `all-MiniLM-L6-v2` embedder. | **skills** — judge-clean | 704 | 0.728 | 0.709 | **0.772** | hybrid | | **skills** — judge-graded | 719 | 0.733 | 0.684 | **0.765** | hybrid | | **packages** — strict | 582 | **0.583** | 0.305 | 0.489 | dense | -| **reports** — strict (n=100) | 100 | 0.431 | 0.367 | **0.477** | hybrid | +| **reports** — strict | 600 | 0.500 | 0.401 | **0.537** | hybrid | "Lenses" are four increasingly-fair definitions of the gold set for skills (§4). The mode **ordering is identical across all four skills lenses** — the result is @@ -172,6 +173,12 @@ and down-sampled to 100 queries. 
**Opportunity (ir/vd):** build/persist the BM25 index once per corpus (alongside the dense matrix) and reuse it across queries — this is the single biggest blocker to running `ir` at corpus scale. +> **Update — fixed.** Resolved in ir #21 (`ir.retrieve` caches a per-corpus +> `vd.BM25Index`, keyed by surfaces+filter) on top of vd #22/#23 (`vd.BM25Index`, +> build-once / query-many). The full 600-query reports run now completes in +> ~3 min for all three modes (before the fix it was killed after >10 min); skills strict +> reproduces the pre-fix numbers exactly, so the change is behavior-preserving. + ### 6.2 Near-duplicate artifacts in the skills corpus 27% of skills are indexed twice (global + package-scoped) with near-identical descriptions. This silently deflates single-gold scores and adds noise to any A/B.