diff --git a/README.md b/README.md index c5ca49c..39198a0 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,8 @@ is the central selection risk — fewer, better candidates beat more), and import ir # Define a corpus, build the index (incremental), then discover: -source = ir.CorpusSource.from_skills() # or from_packages(), from_md_reports(), from_files(...) +source = ir.CorpusSource.from_skills() # or from_packages(), from_md_reports(), + # from_claude_sessions(), from_files(...) corpus = ir.build(source) # embed + persist under XDG dirs result = ir.discover(corpus, "how do I deploy the app to the server") @@ -242,8 +243,11 @@ ir search skills "deploy the app" # rank candidates (retrieval only) ir discover skills "deploy the app" # retrieve -> select ir discover skills "deploy the app" --disclose # + load bodies ir discover skills "deploy the app" --min-score auto # + calibrated abstention +ir build sessions # index recent Claude Code sessions (turn pairs) +ir search sessions "numpy abi error" --mode lexical # find past sessions ir ls # list corpora + record counts -ir info skills # config, stats, calibrated floors +ir info skills # config, stats, policy, calibrated floors +ir maintain --all # run due background work (idempotent; cron-friendly) ir register notes files --root ~/notes --pattern '.*\.md$' # register a custom corpus ir rm notes # unregister (keeps built data) ir eval-gen skills skills_eval.jsonl # generate eval cases (needs oa/LLM) diff --git a/ir/__init__.py b/ir/__init__.py index f9a43c1..2595670 100644 --- a/ir/__init__.py +++ b/ir/__init__.py @@ -45,7 +45,15 @@ default_edge_extractor, ) from .index import Corpus, build, open_corpus -from .registry import retriever_for, retrievers +from .maintenance import MaintenanceResult, maintain, maintain_corpus +from .policy import ( + MaintenancePolicy, + ReindexPolicy, + SynopsisPolicy, + default_policy_for_kind, + resolve_policy, +) +from .registry import policy_for, retriever_for, retrievers from .retrieve import Retriever, as_retriever, fuse_hits, records_for_artifact from .retrieve import search as _search from .select import ( @@ -58,7 +66,14 @@ ) from .sources import CorpusSource from .store import CorpusStore -from .strategy import Chunked, IndexingStrategy, Package, Skill, WholeText +from .strategy import ( + Chunked, + ClaudeTurn, + IndexingStrategy, + Package, + Skill, + WholeText, +) from .synopsis import Synthesizer, make_llm_synthesizer, with_synopsis from .traverse import WalkPolicy, WalkState, collapsed_tree_policy, traverse @@ -73,6 +88,7 @@ "Chunked", "Skill", "Package", + "ClaudeTurn", "with_synopsis", "make_llm_synthesizer", "Synthesizer", @@ -114,6 +130,15 @@ "register", "corpora", "build_corpus", + "maintain", + "maintain_corpus", + "MaintenanceResult", + "MaintenancePolicy", + "ReindexPolicy", + "SynopsisPolicy", + "resolve_policy", + "default_policy_for_kind", + "policy_for", ] register = registry.register @@ -148,6 +173,6 @@ def search(corpus, query, **kwargs): # The evaluation harness is reachable as ``ir.eval`` (its ``ef`` imports are # lazy, so this does not weigh down ``import ir``). Kept out of ``__all__`` so a # star-import does not shadow the ``eval`` builtin. ``ir.eval_gen`` is the -# build-time case generator (its ``oa`` import is lazy too). +# build-time case generator (its ``aix`` import is lazy too). from . import eval # noqa: E402,F401 (submodule attribute: ir.eval) from . import eval_gen # noqa: E402,F401 (submodule attribute: ir.eval_gen) diff --git a/ir/__main__.py b/ir/__main__.py index 9d73db7..1fabd92 100644 --- a/ir/__main__.py +++ b/ir/__main__.py @@ -10,7 +10,28 @@ def main(): import argh parser = argh.ArghParser() - argh.add_commands(parser, COMMANDS) + # argh >= 0.30 requires an explicit name-mapping policy as soon as a command + # has an *optional positional* (e.g. ``maintain(name=None, ...)``). + # ``BY_NAME_IF_KWONLY`` keeps positional params positional and maps + # keyword-only params to options — exactly ir's existing command convention, + # so it changes nothing for the other commands. Fall back gracefully on + # older argh that lacks the policy. + try: + policy = argh.NameMappingPolicy.BY_NAME_IF_KWONLY + except AttributeError: + try: + from argh.assembling import NameMappingPolicy + + policy = NameMappingPolicy.BY_NAME_IF_KWONLY + except ImportError: + policy = None + try: + if policy is not None: + argh.add_commands(parser, COMMANDS, name_mapping_policy=policy) + else: + argh.add_commands(parser, COMMANDS) + except TypeError: # very old argh without the kwarg + argh.add_commands(parser, COMMANDS) parser.dispatch() diff --git a/ir/cli.py b/ir/cli.py index 9a7b8b5..75cb201 100644 --- a/ir/cli.py +++ b/ir/cli.py @@ -11,7 +11,7 @@ ir info packages # config + stats for a corpus ir register notes files --root ~/notes --pattern '.*\\.md$' ir rm notes # unregister (keeps built data) - ir eval-gen skills skills_eval.jsonl --k 5 # generate cases (needs oa/LLM) + ir eval-gen skills skills_eval.jsonl --k 5 # generate cases (needs aix/LLM) ir eval skills skills_eval.jsonl --mode hybrid # score retrieval on a case file ir eval-select skills skills_eval.jsonl # score the selection stage ir sweep-select skills skills_eval.jsonl # tune max_k/rel for the selector @@ -63,14 +63,38 @@ def ls(): return "\n".join(lines) -def register(name, kind, *, root=None, pattern=None, embedder="default"): - """Register a named corpus. kind: skills | packages | reports | files.""" +def register( + name, + kind, + *, + root=None, + pattern=None, + embedder="default", + reindex_on=None, + every_hours=None, + synopsis=False, +): + """Register a named corpus. kind: skills | packages | reports | files. + + Background-work policy (optional; smart per-kind defaults otherwise — see + `ir.policy`): reindex_on (source-change | interval | manual), every_hours (for + interval), synopsis (enable LLM synopses, run only in the policy's downtime + window by `ir maintain`). + """ params = {} if root: params["root"] = root if pattern: params["pattern"] = pattern - registry.register(name, kind, embedder=embedder, **params) + maintenance = None + if reindex_on or every_hours or synopsis: + reindex = {} + if every_hours: + reindex = {"on": "interval", "every_hours": float(every_hours)} + elif reindex_on: + reindex = {"on": reindex_on} + maintenance = {"reindex": reindex, "synopsis": {"enabled": bool(synopsis)}} + registry.register(name, kind, embedder=embedder, maintenance=maintenance, **params) return f"registered {name!r} (kind={kind}, embedder={embedder})" @@ -158,18 +182,45 @@ def discover( def info(name): - """Show a corpus's stored config, stats, and any calibrated abstention floors.""" + """Show a corpus's stored config, stats, policy, and any abstention floors.""" corpus = open_corpus(name) cfg = corpus.store.get_config() reg = registry.get(name) calibrated = corpus.store.calibration_modes() floors = {m: corpus.store.get_calibration(m).get("min_score") for m in calibrated} cal = f"\nmin_score floors: {floors}" if floors else "" + pol = registry.policy_for(name) + state = corpus.store.get_maintenance_state() + last = state.get("last_maintained", "never") + syn = pol.synopsis + syn_str = f"enabled, scope={syn.scope}/{syn.window_days}d" if syn.enabled else "off" + window = f", downtime={syn.downtime_hours}" if syn.downtime_hours else "" + policy_str = ( + f"\npolicy: reindex={pol.reindex.on}" + + (f"/{pol.reindex.every_hours}h" if pol.reindex.every_hours else "") + + f", synopsis={syn_str}{window}\nlast maintained: {last}" + ) return ( - f"name: {name}\nregistered: {reg}\nrecords: {len(corpus)}\nconfig: {cfg}{cal}" + f"name: {name}\nregistered: {reg}\nrecords: {len(corpus)}\n" + f"config: {cfg}{policy_str}{cal}" ) +def maintain(name=None, *, all=False, dry_run=False): + """Run due background work: incremental reindex, synopsis in its downtime window. + + With a name, maintains that corpus; with --all (or no name), every registered + corpus. Idempotent and safe to schedule (cron/launchd): it no-ops what is not + due. --dry-run reports what would run without doing it. + """ + from .maintenance import maintain as _maintain + + results = _maintain(name=name, all=all, dry_run=dry_run) + if not results: + return "no corpora registered" + return "\n".join(str(r) for r in results) + + def rm(name): """Unregister a corpus (does not delete its built data).""" registry.unregister(name) @@ -202,12 +253,12 @@ def eval(name, cases, *, mode="hybrid", k=10): def eval_gen(name, out, *, k=5, abstention_frac=0.15, max_artifacts=None): - """Generate an eval-case file for a corpus by back-translation (needs oa/LLM). + """Generate an eval-case file for a corpus by back-translation (needs aix/LLM). Writes a DiscoveryCase JSONL set (gold cases + an abstention slice) for the registered corpus *name* to *out*, stamping a corpus-signature into the header so the frozen file can be checked against the live corpus later. This - command calls an LLM via oa; scoring it afterwards (`ir eval`) is offline. + command calls an LLM via aix; scoring it afterwards (`ir eval`) is offline. """ from .eval import save_cases from .eval_gen import build_eval_set, corpus_signature @@ -375,6 +426,7 @@ def calibrate_min_score( search, discover, info, + maintain, rm, eval, eval_gen, diff --git a/ir/eval_gen.py b/ir/eval_gen.py index 5922b0e..5cf521d 100644 --- a/ir/eval_gen.py +++ b/ir/eval_gen.py @@ -19,9 +19,9 @@ The LLM is **injected** (`query_generator` / `abstention_generator` callables), so the generation *logic* — masking, gold assignment, the leakage guard, the abstention fraction — is fully testable with a deterministic stub and no network. -The default generators are built lazily on :mod:`oa` (`oa.prompt_function`), so -``import ir.eval_gen`` stays cheap and offline; ``oa`` is only imported when you -actually generate with the real LLM. +The default generators are built lazily on :mod:`aix` (`aix.prompt_func`, the +multi-provider LLM facade), so ``import ir.eval_gen`` stays cheap and offline; +``aix`` is only imported when you actually generate with the real LLM. The output is plain :class:`~ir.eval.DiscoveryCase` data — freeze it with :func:`ir.eval.save_cases` (stamping :func:`corpus_signature` into the @@ -34,7 +34,7 @@ from ir import eval_gen as eg source = ir.CorpusSource.from_skills() - cases = eg.build_eval_set(source, k=5, corpus_name="skills") # uses oa + cases = eg.build_eval_set(source, k=5, corpus_name="skills") # uses aix from ir.eval import save_cases save_cases(cases, "skills_eval.jsonl", meta={"corpus": "skills", "corpus_signature": eg.corpus_signature(source)}) @@ -173,7 +173,7 @@ def _leaks_name(text: str, name: str) -> bool: # =========================================================================== # -# Default (oa-backed) generators — lazily built, only when actually used +# Default (aix-backed) generators — lazily built, only when actually used # =========================================================================== # @@ -199,13 +199,13 @@ def _parse_lines(text: Any) -> list[str]: return lines -def make_oa_query_generator( +def make_default_query_generator( *, prompt: str = BACKTRANSLATION_PROMPT, **prompt_function_kwargs: Any ) -> QueryGenerator: - """Build the default back-translation generator on :mod:`oa` (lazy import).""" - import oa + """Build the default back-translation generator on :mod:`aix` (lazy import).""" + import aix - fn = oa.prompt_function( + fn = aix.prompt_func( prompt, egress=_parse_lines, name="backtranslate", **prompt_function_kwargs ) @@ -215,13 +215,13 @@ def generate(description: str, *, n: int) -> list[str]: return generate -def make_oa_abstention_generator( +def make_default_abstention_generator( *, prompt: str = ABSTENTION_PROMPT, **prompt_function_kwargs: Any ) -> AbstentionGenerator: - """Build the default abstention generator on :mod:`oa` (lazy import).""" - import oa + """Build the default abstention generator on :mod:`aix` (lazy import).""" + import aix - fn = oa.prompt_function( + fn = aix.prompt_func( prompt, egress=_parse_lines, name="abstention", **prompt_function_kwargs ) @@ -287,7 +287,7 @@ def generate_cases( mask_names: scrub the artifact name from the description before generating, and drop any generated intent that still contains it. query_generator: ``(description, *, n) -> [intent, …]``. Defaults to the - :mod:`oa`-backed back-translator (built lazily; needs a model). + :mod:`aix`-backed back-translator (built lazily; needs a model). describe: ``raw -> description`` (default: the ``description`` / ``text`` field, else the joined string fields). min_chars: skip artifacts whose description is shorter than this. @@ -304,7 +304,7 @@ def generate_cases( """ if k < 1: raise ValueError(f"k must be >= 1, got {k!r}.") - gen = query_generator or make_oa_query_generator() + gen = query_generator or make_default_query_generator() describe = describe or _default_describe cases: list[DiscoveryCase] = [] skipped = 0 @@ -368,7 +368,7 @@ def generate_abstention_cases( """Generate ``n`` abstention cases — out-of-scope intents (empty ``gold``).""" if n <= 0: return [] - gen = generator or make_oa_abstention_generator() + gen = generator or make_default_abstention_generator() intents = gen(n=n, theme=theme) cases = [ DiscoveryCase( diff --git a/ir/formulate.py b/ir/formulate.py index 1ed5590..a4a6f66 100644 --- a/ir/formulate.py +++ b/ir/formulate.py @@ -17,7 +17,7 @@ lives in the agent layer (``raglab``), not here. :func:`make_llm_formulator` mirrors :func:`ir.select.make_llm_selector`: an -injectable ``rewriter`` callable, built lazily on :mod:`oa` when omitted (so +injectable ``rewriter`` callable, built lazily on :mod:`aix` when omitted (so importing ir stays offline), falling back to identity on any failure — a formulator must never make retrieval *worse* than the raw query. """ @@ -47,13 +47,13 @@ def identity_formulator(query: str) -> str: def _default_llm_rewriter(prompt: str, n: int, **prompt_function_kwargs: Any): - """Build the default LLM rewriter on :mod:`oa` (lazy import).""" - import oa + """Build the default LLM rewriter on :mod:`aix` (lazy import).""" + import aix def _parse_lines(text: str) -> list[str]: return [line.strip(" -\t") for line in str(text).splitlines() if line.strip()] - fn = oa.prompt_function( + fn = aix.prompt_func( prompt, egress=_parse_lines, name="formulate_queries", **prompt_function_kwargs ) @@ -74,8 +74,8 @@ def make_llm_formulator( """An LLM-backed :data:`Formulator` (rewrite / expand / multi-query). ``rewriter`` is an injectable ``query -> str | [str, ...]`` callable (a test - double, or your own router); when omitted it is built lazily on :mod:`oa` - (``oa.prompt_function``), so importing this module stays offline. ``n`` is the + double, or your own router); when omitted it is built lazily on :mod:`aix` + (``aix.prompt_func``), so importing this module stays offline. ``n`` is the multi-query fan-out width. Any error or empty reply falls back to ``fallback`` (default: :func:`identity_formulator`). """ diff --git a/ir/index.py b/ir/index.py index bc27b82..780f0e9 100644 --- a/ir/index.py +++ b/ir/index.py @@ -73,6 +73,37 @@ def _embed_batched(emb, texts, input_type, batch_size): return np.vstack(out) if out else np.zeros((0, 0), dtype=np.float32) +class _LazyEmbedder: + """An embedder callable that resolves its model on first *call*, not first use. + + :func:`open_corpus` binds this as a :class:`Corpus`'s ``embedder`` so the + (heavy, ~seconds) embedding-model load is paid only when a query is actually + embedded — ``ls`` / ``info`` and lexical-only search, which never embed, + never trigger it. The corpus's ``embedder_id`` is read from stored config, so + it is known without resolving the model. Transparent: it forwards ``__call__`` + (including ``input_type=``) to the resolved embedder, so every existing call + site (``_embed`` and its ``TypeError`` fallback) works unchanged. + """ + + def __init__(self, spec: Any): + self._spec = spec + self._resolved: Callable | None = None + self._resolved_id: str | None = None + + def _resolve(self) -> Callable: + if self._resolved is None: + self._resolved, self._resolved_id = make_embedder(self._spec) + return self._resolved + + def __call__(self, texts, **kwargs): + return self._resolve()(texts, **kwargs) + + @property + def embedder_id(self) -> str: + self._resolve() + return self._resolved_id or "custom" + + @dataclass class Corpus: """A built, queryable corpus: a store plus its embedder.""" @@ -222,9 +253,16 @@ def build( def open_corpus(name: str, *, embedder: Any = None) -> Corpus: - """Reopen a previously built corpus by name (resolves its embedder).""" + """Reopen a previously built corpus by name. + + The embedding model is **lazily** resolved (see :class:`_LazyEmbedder`): the + returned corpus knows its ``embedder_id`` from stored config immediately, but + only loads the model when a dense/hybrid query actually embeds. So ``ir ls``, + ``ir info``, and lexical-only search open a corpus without the model-load + cost. Pass ``embedder=`` to override the stored spec. + """ store = CorpusStore.local(name) cfg = store.get_config() spec = embedder if embedder is not None else cfg.get("embedder_spec", "default") - emb, emb_id = make_embedder(spec) - return Corpus(name, store, emb, cfg.get("embedder_id", emb_id)) + emb_id = cfg.get("embedder_id") or "" + return Corpus(name, store, _LazyEmbedder(spec), emb_id) diff --git a/ir/maintenance.py b/ir/maintenance.py new file mode 100644 index 0000000..b481b69 --- /dev/null +++ b/ir/maintenance.py @@ -0,0 +1,136 @@ +"""Idempotent background-work runner — ``ir maintain`` (issue #58). + +The *executing* half of ir's maintenance story (the declarative half is +:mod:`ir.policy`). :func:`maintain` reads each corpus's resolved +:class:`~ir.policy.MaintenancePolicy` and does the work that is **due**: + +- an **incremental rebuild** when reindex is due (``source-change`` is always + due — the build is a near-no-op when nothing changed; ``interval`` only when + stale; ``manual`` never); +- when ``synopsis.enabled``, the rebuild's strategy is wrapped in + :func:`ir.with_synopsis`, so new / changed artifacts gain an LLM synopsis — + and because that build may call an LLM, it is run **only inside the policy's + downtime window**. + +It is safe to call as often as a scheduler likes: it no-ops when nothing is due, +and records ``last_maintained`` so interval policies converge. ir runs the work; +*scheduling* it is external — a cron / launchd entry calls ``ir maintain --all`` +every N minutes, and the downtime window lives in the policy (data), not in ir. + + # crontab: try the queue every 15 minutes; ir decides what is actually due. + */15 * * * * ir maintain --all >> ~/.cache/ir/maintain.log 2>&1 +""" + +from __future__ import annotations + +from dataclasses import dataclass, replace as dc_replace +from datetime import datetime +from typing import Any + +from . import registry +from .index import build, open_corpus +from .policy import in_downtime, is_reindex_due, resolve_policy + + +@dataclass +class MaintenanceResult: + """What :func:`maintain_corpus` did (or would do) for one corpus.""" + + name: str + ran: bool + reason: str + reindex: bool = False + synopsis: bool = False + records: int | None = None + + def __str__(self) -> str: + verb = "maintained" if self.ran else "skipped" + bits = [b for b, on in (("reindex", self.reindex), ("synopsis", self.synopsis)) if on] + what = "+".join(bits) or "-" + recs = f", {self.records} records" if self.records is not None else "" + return f"{self.name}: {verb} [{what}] ({self.reason}){recs}" + + +def _now(now: datetime | None) -> datetime: + # Local clock: the downtime window is expressed in local hours, and interval + # math stays consistent by using the same clock for both. Injectable for tests. + return now if now is not None else datetime.now() + + +def _parse_iso(s: Any) -> datetime | None: + if not s: + return None + try: + return datetime.fromisoformat(s) + except (TypeError, ValueError): + return None + + +def _synopsis_wrapped(source): + """Return *source* with its strategy wrapped in :func:`ir.with_synopsis`.""" + from .synopsis import with_synopsis + + return dc_replace(source, indexing_strategy=with_synopsis(source.indexing_strategy)) + + +def maintain_corpus( + name: str, *, now: datetime | None = None, dry_run: bool = False, full: bool = True +) -> MaintenanceResult: + """Do the due background work for one corpus (idempotent). + + Reads the corpus's resolved policy and its ``last_maintained`` time, decides + whether a (synopsis-aware) reindex is due *and* permitted now, and — unless + ``dry_run`` — runs the incremental build and records the run. + """ + now = _now(now) + policy = resolve_policy(registry.get(name)) + store = open_corpus(name).store # lazy: opening never loads the model + last = _parse_iso(store.get_maintenance_state().get("last_maintained")) + + due = is_reindex_due(policy, last, now) + synopsis_on = policy.synopsis.enabled + # A synopsis build may hit an LLM, so it is confined to the downtime window. + blocked = synopsis_on and not in_downtime(policy, now) + + if not due or blocked: + reason = ( + "synopsis build deferred to downtime window" + if due and blocked + else f"not due ({policy.reindex.on})" + ) + return MaintenanceResult(name, False, reason, reindex=due, synopsis=synopsis_on) + + if dry_run: + return MaintenanceResult( + name, False, "dry-run (would reindex)", reindex=True, synopsis=synopsis_on + ) + + source = registry.source_for(name) + if synopsis_on: + source = _synopsis_wrapped(source) + built = build(source, full=full) + built.store.set_maintenance_state( + {"last_maintained": now.isoformat(), "synopsis_enabled": synopsis_on} + ) + return MaintenanceResult( + name, True, "reindexed", reindex=True, synopsis=synopsis_on, records=len(built) + ) + + +def maintain( + name: str | None = None, + *, + all: bool = False, + now: datetime | None = None, + dry_run: bool = False, +) -> list[MaintenanceResult]: + """Run due background work for one corpus (*name*) or every registered one (*all*). + + With neither, defaults to all registered corpora. Returns one + :class:`MaintenanceResult` per corpus considered. + """ + if name and not all: + names = [name] + else: + names = list(registry.registered()) + return [maintain_corpus(n, now=now, dry_run=dry_run) for n in names] diff --git a/ir/policy.py b/ir/policy.py new file mode 100644 index 0000000..59582b7 --- /dev/null +++ b/ir/policy.py @@ -0,0 +1,243 @@ +"""Per-corpus policy — how a corpus is segmented/stored and what background work it gets. + +This is the *declarative* half of ir's maintenance story (issue #58): a corpus's +policy lives as **data** in its registry entry, and ir resolves an effective +policy by layering ``entry`` over per-``kind`` defaults over a global default. +An idempotent :func:`ir.maintenance.maintain` reads the policy and does the due +work; *scheduling* that work (cron / launchd, or an orchestration layer's budget +governor) stays **external** — ir never imports an orchestration layer, and there +is no global ``Settings`` singleton (policy is per-corpus data, injected, not a +process-wide mutable). The light path stays a one-liner: every field has a smart +default, so ``ir build skills`` needs no policy at all. + +The three policy axes: + +- **reindex** — *when* to rebuild: ``"source-change"`` (default; rebuild is a + near-no-op when nothing changed, so it is always safe to run), ``"interval"`` + (only when older than ``every_hours``), or ``"manual"`` (never automatic). +- **synopsis** — *whether/when* to attach LLM synopses (the expensive, + off-by-default work): only the ``recent`` slice, only during ``downtime_hours``. + Synopsis is realized as a strategy wrapper (:func:`ir.with_synopsis`), so it is + incremental — only new/changed artifacts are synthesized. +- **storage** — the persistence backend. Today only ``"local"`` (the file store); + a ``vd.Collection`` backend is the documented future seam (issue #28). +""" + +from __future__ import annotations + +from dataclasses import dataclass, field, replace +from datetime import datetime, timedelta +from typing import Any + +# --------------------------------------------------------------------------- # +# Policy data model (immutable; pure data — no I/O, no model loading) +# --------------------------------------------------------------------------- # + +#: Accepted ``reindex.on`` triggers. +REINDEX_TRIGGERS = ("source-change", "interval", "manual") + +#: Storage backends ir can resolve today. ``"vd"`` (vd.Collection) is the +#: documented future backend, gated on issue #28. +STORAGE_BACKENDS = ("local",) + + +@dataclass(frozen=True) +class ReindexPolicy: + """When to (incrementally) rebuild a corpus.""" + + on: str = "source-change" + every_hours: float | None = None # used only when ``on == "interval"`` + + def __post_init__(self): + if self.on not in REINDEX_TRIGGERS: + raise ValueError( + f"unknown reindex trigger {self.on!r}; expected {REINDEX_TRIGGERS}" + ) + + @classmethod + def from_dict(cls, d: dict | None) -> "ReindexPolicy": + d = d or {} + return cls( + on=d.get("on", "source-change"), + every_hours=d.get("every_hours"), + ) + + def to_dict(self) -> dict: + out: dict[str, Any] = {"on": self.on} + if self.every_hours is not None: + out["every_hours"] = self.every_hours + return out + + +@dataclass(frozen=True) +class SynopsisPolicy: + """Whether/when to attach (expensive, LLM-generated) synopses. + + ``downtime_hours`` is a ``[start, end)`` pair of local-clock hours + (wrapping past midnight is allowed, e.g. ``(22, 6)``); ``None`` means + "any time". ``scope="recent"`` limits synthesis to artifacts whose + timestamp is within ``window_days`` (corpora that expose a time signal); + ``scope="all"`` synthesizes every artifact (bounded by incrementality). + """ + + enabled: bool = False + scope: str = "recent" # "recent" | "all" + window_days: int = 30 + downtime_hours: tuple[int, int] | None = None + + @classmethod + def from_dict(cls, d: dict | None) -> "SynopsisPolicy": + d = d or {} + dh = d.get("downtime_hours") + return cls( + enabled=bool(d.get("enabled", False)), + scope=d.get("scope", "recent"), + window_days=int(d.get("window_days", 30)), + downtime_hours=(int(dh[0]), int(dh[1])) if dh else None, + ) + + def to_dict(self) -> dict: + out: dict[str, Any] = { + "enabled": self.enabled, + "scope": self.scope, + "window_days": self.window_days, + } + if self.downtime_hours is not None: + out["downtime_hours"] = list(self.downtime_hours) + return out + + +@dataclass(frozen=True) +class MaintenancePolicy: + """The background-work policy for one corpus (reindex + synopsis).""" + + reindex: ReindexPolicy = field(default_factory=ReindexPolicy) + synopsis: SynopsisPolicy = field(default_factory=SynopsisPolicy) + + @classmethod + def from_dict(cls, d: dict | None) -> "MaintenancePolicy": + d = d or {} + return cls( + reindex=ReindexPolicy.from_dict(d.get("reindex")), + synopsis=SynopsisPolicy.from_dict(d.get("synopsis")), + ) + + def to_dict(self) -> dict: + return {"reindex": self.reindex.to_dict(), "synopsis": self.synopsis.to_dict()} + + def merged(self, override: dict | None) -> "MaintenancePolicy": + """Layer an ``override`` dict on top of this policy (entry over defaults).""" + if not override: + return self + return replace( + self, + reindex=ReindexPolicy.from_dict( + {**self.reindex.to_dict(), **(override.get("reindex") or {})} + ), + synopsis=SynopsisPolicy.from_dict( + {**self.synopsis.to_dict(), **(override.get("synopsis") or {})} + ), + ) + + +# --------------------------------------------------------------------------- # +# Smart defaults per kind — the "rules that lead to defaults" for new corpora +# --------------------------------------------------------------------------- # + +#: The global fallback policy (any kind without a specific default). +GLOBAL_DEFAULT = MaintenancePolicy() + +#: Per-kind maintenance defaults. The rule of thumb encoded here: +#: *small, fully-enumerable* corpora (skills/packages/reports/files) rebuild on +#: source change (cheap, exact); *large, append-mostly, time-stamped* corpora +#: (sessions) rebuild on an interval and keep synopsis off by default, with a +#: downtime window ready for when it is turned on. New kinds register here. +DEFAULTS_BY_KIND: dict[str, dict] = { + "skills": {"reindex": {"on": "source-change"}}, + "packages": {"reindex": {"on": "source-change"}}, + "reports": {"reindex": {"on": "source-change"}}, + "files": {"reindex": {"on": "source-change"}}, + "sessions": { + "reindex": {"on": "interval", "every_hours": 24}, + "synopsis": { + "enabled": False, + "scope": "recent", + "window_days": 30, + "downtime_hours": [2, 6], + }, + }, +} + + +def default_policy_for_kind(kind: str) -> MaintenancePolicy: + """The smart-default :class:`MaintenancePolicy` for a corpus ``kind``.""" + return GLOBAL_DEFAULT.merged(DEFAULTS_BY_KIND.get(kind)) + + +def resolve_policy(entry: dict | None) -> MaintenancePolicy: + """The effective policy for a registry ``entry``: entry over kind over global. + + A v1 entry (no ``maintenance`` key) resolves to its kind's smart default, so + existing corpora gain a sensible policy without a migration. + """ + entry = entry or {} + base = default_policy_for_kind(entry.get("kind", "")) + return base.merged(entry.get("maintenance")) + + +def resolve_storage(entry: dict | None) -> dict: + """The effective storage spec for an ``entry`` (default ``{"backend": "local"}``).""" + storage = dict((entry or {}).get("storage") or {}) + backend = storage.get("backend", "local") + if backend not in STORAGE_BACKENDS: + raise NotImplementedError( + f"storage backend {backend!r} is not available yet (only " + f"{STORAGE_BACKENDS}); a vd.Collection backend is tracked in issue #28." + ) + storage["backend"] = backend + return storage + + +# --------------------------------------------------------------------------- # +# Timing predicates (pure; ``now`` is injected for testability) +# --------------------------------------------------------------------------- # + + +def is_reindex_due( + policy: MaintenancePolicy, + last_maintained: datetime | None, + now: datetime, +) -> bool: + """Whether a reindex is due under ``policy`` given the last-maintained time. + + - ``source-change`` is always due (the build is incremental and a no-op when + nothing changed, so running it cannot do harm). + - ``interval`` is due when never maintained or older than ``every_hours``. + - ``manual`` is never automatically due. + """ + on = policy.reindex.on + if on == "source-change": + return True + if on == "manual": + return False + # interval + every = policy.reindex.every_hours + if not every or last_maintained is None: + return True + return now - last_maintained >= timedelta(hours=every) + + +def in_downtime(policy: MaintenancePolicy, now: datetime) -> bool: + """Whether ``now`` falls inside the synopsis ``downtime_hours`` window. + + ``None`` window means "any time". A window whose start hour is greater than + its end hour wraps past midnight (e.g. ``(22, 6)`` is 22:00–06:00). + """ + dh = policy.synopsis.downtime_hours + if not dh: + return True + start, end = dh + hour = now.hour + if start <= end: + return start <= hour < end + return hour >= start or hour < end # wraps midnight diff --git a/ir/registry.py b/ir/registry.py index 559ade6..2bf3f2d 100644 --- a/ir/registry.py +++ b/ir/registry.py @@ -10,10 +10,11 @@ - ``skills`` → :meth:`CorpusSource.from_skills` - ``packages`` → :meth:`CorpusSource.from_packages` - ``reports`` → :meth:`CorpusSource.from_md_reports` +- ``sessions`` → :meth:`CorpusSource.from_claude_sessions` - ``files`` → :meth:`CorpusSource.from_files` (needs ``root``; optional ``pattern``) -Unregistered preset names (``skills``/``packages``/``reports``) are +Unregistered preset names (``skills``/``packages``/``reports``/``sessions``) are auto-registered with defaults on first use, so ``ir build skills`` just works. """ @@ -26,7 +27,7 @@ from .config import registry_path from .sources import CorpusSource -PRESETS = ("skills", "packages", "reports") +PRESETS = ("skills", "packages", "reports", "sessions") def _load() -> dict[str, Any]: @@ -40,14 +41,58 @@ def _save(entries: dict[str, Any]) -> None: registry_path().write_text(json.dumps(entries, indent=2), encoding="utf-8") -def register(name: str, kind: str, *, embedder: str = "default", **params) -> dict: - """Register (or overwrite) a named corpus definition.""" +def register( + name: str, + kind: str, + *, + embedder: str = "default", + strategy: Any = None, + maintenance: Mapping[str, Any] | None = None, + storage: Mapping[str, Any] | None = None, + **params, +) -> dict: + """Register (or overwrite) a named corpus definition. + + Beyond the v1 ``kind`` / ``embedder`` / ``params``, an entry may now carry + (all optional, with smart per-kind defaults applied at resolution time — see + :mod:`ir.policy`): + + - ``strategy`` — an :class:`~ir.strategy.IndexingStrategy` (or a + ``{"name", "params"}`` spec) persisted so the corpus's *segmentation* is + stable across rebuilds. ``None`` keeps the preset's default strategy. + - ``maintenance`` — the background-work policy dict (``reindex`` / ``synopsis``; + validated here, see :class:`ir.policy.MaintenancePolicy`). + - ``storage`` — the persistence backend (default ``{"backend": "local"}``). + + Entries written by older ``ir`` (none of these keys) keep working unchanged. + """ if kind not in PRESETS and kind != "files": raise ValueError( f"Unknown corpus kind {kind!r}; use one of {PRESETS} or 'files'." ) + from .policy import MaintenancePolicy, resolve_storage + from .strategy import IndexingStrategy, strategy_to_spec + + entry: dict[str, Any] = {"kind": kind, "embedder": embedder, "params": params} + if strategy is not None: + if isinstance(strategy, Mapping): + entry["strategy"] = dict(strategy) + elif isinstance(strategy, IndexingStrategy): + entry["strategy"] = strategy_to_spec(strategy) + else: + raise TypeError( + f"strategy must be an IndexingStrategy or a spec dict, got " + f"{type(strategy).__name__}" + ) + if maintenance is not None: + # Validate (raises on a bad trigger) and store the normalized form. + entry["maintenance"] = MaintenancePolicy.from_dict(dict(maintenance)).to_dict() + if storage is not None: + resolve_storage({"storage": dict(storage)}) # validate backend + entry["storage"] = dict(storage) + entries = _load() - entries[name] = {"kind": kind, "embedder": embedder, "params": params} + entries[name] = entry _save(entries) return entries[name] @@ -70,22 +115,53 @@ def unregister(name: str) -> None: def source_from_entry(name: str, entry: dict) -> CorpusSource: - """Reconstruct a :class:`CorpusSource` from a registry entry.""" + """Reconstruct a :class:`CorpusSource` from a registry entry. + + A persisted ``strategy`` spec (registry v2) is reconstructed and passed to + the preset constructor, so a corpus's segmentation survives across rebuilds. + A v1 entry (no ``strategy``) passes ``strategy=None`` and keeps the preset's + default — unchanged behavior. + """ + from .strategy import strategy_from_spec + kind = entry["kind"] params = dict(entry.get("params", {})) embedder = entry.get("embedder", "default") + strategy = strategy_from_spec(entry.get("strategy")) if kind == "skills": - return CorpusSource.from_skills(name=name, embedder=embedder) + return CorpusSource.from_skills(name=name, embedder=embedder, strategy=strategy) if kind == "packages": - return CorpusSource.from_packages(name=name, embedder=embedder) + return CorpusSource.from_packages( + name=name, embedder=embedder, strategy=strategy + ) if kind == "reports": - return CorpusSource.from_md_reports(name=name, embedder=embedder) + return CorpusSource.from_md_reports( + name=name, embedder=embedder, strategy=strategy + ) + if kind == "sessions": + return CorpusSource.from_claude_sessions( + name=name, embedder=embedder, strategy=strategy, **params + ) if kind == "files": root = params.pop("root") - return CorpusSource.from_files(root, name=name, embedder=embedder, **params) + return CorpusSource.from_files( + root, name=name, embedder=embedder, strategy=strategy, **params + ) raise ValueError(f"Unknown corpus kind {kind!r}.") +def policy_for(name: str): + """The effective :class:`ir.policy.MaintenancePolicy` for corpus *name*. + + Resolves the registered entry's ``maintenance`` over its kind's smart default + over the global default (see :func:`ir.policy.resolve_policy`). An unregistered + name resolves to the global default policy. + """ + from .policy import resolve_policy + + return resolve_policy(get(name)) + + def source_for(name: str) -> CorpusSource: """Resolve *name* to a source, auto-registering a preset if needed.""" entry = get(name) diff --git a/ir/retrieve.py b/ir/retrieve.py index 3b8c7a4..6af1484 100644 --- a/ir/retrieve.py +++ b/ir/retrieve.py @@ -414,7 +414,13 @@ def search( return merged[:k] query = queries[0] - ids, mat, metas = corpus.store.matrix() + # Lexical ranking scores on text alone, so it must not pay the embedding + # matrix's I/O; dense/hybrid need the matrix. (Both reuse the same cache.) + if mode == "lexical": + ids, metas = corpus.store.metas() + mat = None + else: + ids, mat, metas = corpus.store.matrix() if not ids: return [] diff --git a/ir/select.py b/ir/select.py index fd25976..f686868 100644 --- a/ir/select.py +++ b/ir/select.py @@ -444,7 +444,7 @@ def make_llm_selector( The model reads the candidates' descriptions and commits to a subset. ``chooser`` is an injectable ``(query, candidates) -> [id, …]`` callable (a test double, or your own router); when omitted it is built lazily on - :mod:`oa` (``oa.prompt_function``), so importing this module stays offline. + :mod:`aix` (``aix.prompt_func``), so importing this module stays offline. Robustness: any error or empty/garbled reply falls back to ``fallback`` (default: the ``"conservative"`` heuristic), because LLM selection is known @@ -490,13 +490,13 @@ def selector(hits: Sequence[SearchHit]) -> list[SearchHit]: def _default_llm_chooser(prompt: str, **prompt_function_kwargs: Any): - """Build the default LLM chooser on :mod:`oa` (lazy import).""" - import oa + """Build the default LLM chooser on :mod:`aix` (lazy import).""" + import aix def _parse_ids(text: str) -> list[str]: return [line.strip(" -\t") for line in str(text).splitlines() if line.strip()] - fn = oa.prompt_function( + fn = aix.prompt_func( prompt, egress=_parse_ids, name="select_capabilities", **prompt_function_kwargs ) diff --git a/ir/sources.py b/ir/sources.py index dc63115..65db925 100644 --- a/ir/sources.py +++ b/ir/sources.py @@ -26,10 +26,20 @@ from pathlib import Path from typing import Any -from .strategy import Chunked, IndexingStrategy, Package, Skill, WholeText +from .strategy import ( + Chunked, + ClaudeTurn, + IndexingStrategy, + Package, + Skill, + WholeText, +) ALLCAPS_MD = re.compile(r"^[A-Z0-9_ ]+\.md$") +#: Default look-back window (days) for :meth:`CorpusSource.from_claude_sessions`. +DFLT_SESSIONS_SINCE_DAYS = 90 + def content_hash_signal(artifact_id: str, raw: Any) -> str: """Default change signal: a content hash of the raw payload.""" @@ -193,6 +203,67 @@ def metadata_of(aid, raw): **kwargs, ) + @classmethod + def from_claude_sessions( + cls, + *, + name: str = "sessions", + since: float | None = DFLT_SESSIONS_SINCE_DAYS, + projects: Any = None, + include_full: bool = False, + include_session_title: bool = True, + max_sessions: int | None = None, + root: str | Path | None = None, + fetcher: Callable[[], list] | None = None, + strategy: IndexingStrategy | None = None, + **kwargs, + ) -> "CorpusSource": + """The user's Claude Code session transcripts as a corpus (turn pairs). + + Each artifact is one user→assistant turn pair; the default + :class:`~ir.strategy.ClaudeTurn` strategy indexes the user prompt and the + assistant's end-of-turn summary as separate surfaces (target either with + ``surfaces={"user_prompt"}`` / ``{"assistant_summary"}``). ``include_full`` + adds the full assistant text surface (off by default — the summary is the + signal). ``include_session_title`` (default on) also indexes one record per + session whose surface is the session's persisted custom/AI title — a cheap + "what was this session about" surface. Scope defaults to the last ``since`` + days (a full-history build is heavy); narrow with ``projects`` (a cwd + substring or list) and ``max_sessions``. + + ``fetcher`` overrides the record source (each a mapping with + ``user_prompt`` / ``assistant_summary`` / ... ) — inject a test double to + avoid the ``priv`` dependency. Otherwise records come from + :func:`priv.claude_transcripts.turn_pair_records`. + """ + if fetcher is not None: + records = list(fetcher()) + else: + from priv.claude_transcripts import turn_pair_records + + records = list( + turn_pair_records( + root=root, + since=since, + projects=projects, + max_sessions=max_sessions, + include_session_title=include_session_title, + ) + ) + # Collision-safe scope: ids are session_id:user_uuid (already unique); a + # rare missing id falls back to enumeration so no pair is silently dropped. + scope: dict[str, dict] = {} + for i, r in enumerate(records): + key = r.get("id") or f"turn_{i}" + scope[key] = r + + return cls( + name=name, + scope=scope, + indexing_strategy=strategy or ClaudeTurn(include_full=include_full), + **kwargs, + ) + @classmethod def from_packages( cls, diff --git a/ir/store.py b/ir/store.py index e7a72d5..d8cee0c 100644 --- a/ir/store.py +++ b/ir/store.py @@ -30,14 +30,20 @@ import copy import io +import json import os from collections.abc import Iterator, Mapping, MutableMapping +from pathlib import Path from typing import Any import numpy as np from .base import Record +#: Bump when the on-disk packed-matrix layout changes so a stale cache from an +#: older ``ir`` is treated as invalid (rebuilt) rather than mis-read. +_PACKED_FORMAT = 1 + def _ndarray_store(rootdir) -> MutableMapping[str, np.ndarray]: """A ``dol`` file store whose values are float32 ``ndarray``s.""" @@ -78,6 +84,8 @@ def __init__( config: MutableMapping[str, Any], calibration: MutableMapping[str, Any] | None = None, links: MutableMapping[str, Any] | None = None, + *, + packed_dir: str | Path | None = None, ): self.meta = meta self.vectors = vectors @@ -89,6 +97,13 @@ def __init__( self.calibration = {} if calibration is None else calibration self.links = {} if links is None else links self._matrix_cache: tuple | None = None + # Optional on-disk packed-matrix cache (a single normalized matrix + its + # ids/metas as three files), so reopening a corpus skips the per-record + # vector-file storm. ``None`` (e.g. for the in-memory store) keeps the + # matrix purely in-process, preserving the original behavior. The cache + # is a *write-invalidated read cache*: any record write clears it. + self._packed_dir = Path(packed_dir) if packed_dir is not None else None + self._packed_stale = False # ----- factories ------------------------------------------------------ # @@ -105,6 +120,7 @@ def local(cls, name: str) -> "CorpusStore": config=_json_store(root / "config"), calibration=_json_store(root / "calibration"), links=_json_store(root / "links"), + packed_dir=root / "matrix", ) @classmethod @@ -124,7 +140,7 @@ def put_record(self, record: Record) -> None: "metadata": dict(record.metadata), } self.vectors[record.id] = np.asarray(record.vector, dtype=np.float32) - self._matrix_cache = None + self._invalidate_matrix() def delete_record(self, record_id: str) -> None: """Remove a record's metadata + vector; a missing id is tolerated.""" @@ -133,7 +149,7 @@ def delete_record(self, record_id: str) -> None: del self.vectors[record_id] except KeyError: pass - self._matrix_cache = None + self._invalidate_matrix() def record_ids(self) -> Iterator[str]: """Iterate the record ids currently stored.""" @@ -185,6 +201,19 @@ def set_config(self, settings: Mapping[str, Any]) -> None: """Persist the corpus build *settings* (name / embedder spec + id).""" self.config["config"] = dict(settings) + def get_maintenance_state(self) -> dict: + """Background-work bookkeeping (e.g. ``last_maintained``); ``{}`` if unset. + + Kept under a separate ``config``-view key from the build settings: it is + regenerable scheduler state (when ``ir maintain`` last ran), not part of + the corpus's build identity, so it must never clobber it. + """ + return dict(self.config.get("maintenance", {})) + + def set_maintenance_state(self, state: Mapping[str, Any]) -> None: + """Persist the maintenance bookkeeping for this corpus.""" + self.config["maintenance"] = dict(state) + # ----- calibration (per-mode) ----------------------------------------- # def get_calibration(self, mode: str) -> dict | None: @@ -251,19 +280,144 @@ def matrix(self) -> tuple[list[str], np.ndarray, list[dict]]: """Return ``(record_ids, normalized_matrix, metas)`` for brute force. Rows are L2-normalized so cosine similarity is a dot product. Empty - corpora return a ``(0, 0)`` matrix. Cached until the next write. + corpora return a ``(0, 0)`` matrix. + + Caching is two-tier: an in-process cache (invalidated on the next write) + backed, for file-rooted stores, by an on-disk **packed** cache — one + normalized-matrix ``.npy`` plus its ids/metas, written once and reloaded + with a single memory-mapped read. The packed cache turns a cold reopen + from a per-record vector-file storm (thousands of tiny reads) into three + file reads; it is cleared by any record write, so it never goes stale. """ if self._matrix_cache is not None: return self._matrix_cache + packed = self._load_packed() + if packed is not None: + self._matrix_cache = packed + return packed + result = self._build_matrix() + self._save_packed(result) + self._matrix_cache = result + return result + + def metas(self) -> tuple[list[str], list[dict]]: + """Return ``(record_ids, metas)`` **without** loading any vectors. + + The vector-free counterpart of :meth:`matrix`, for ranking modes that + score on text alone (``mode="lexical"``): they need candidate metadata + (text + filter fields) but never the embedding matrix, so they must not + pay its I/O. Reuses the in-process or packed cache when present; else + reads only the ``meta`` view (not ``vectors``). + """ + if self._matrix_cache is not None: + ids, _mat, metas = self._matrix_cache + return ids, metas + packed = self._load_packed() + if packed is not None: + self._matrix_cache = packed + return packed[0], packed[2] + ids = list(self.meta) + metas = [self.meta[rid] for rid in ids] + return ids, metas + + def _build_matrix(self) -> tuple[list[str], np.ndarray, list[dict]]: + """Build ``(ids, normalized_matrix, metas)`` from the per-record stores. + + One pass over the ids reads each record's meta and vector together + (the previous implementation iterated the meta view three times). + """ ids = list(self.meta) if not ids: - self._matrix_cache = ([], np.zeros((0, 0), dtype=np.float32), []) - return self._matrix_cache - rows = [np.asarray(self.vectors[rid], dtype=np.float32) for rid in ids] + return ([], np.zeros((0, 0), dtype=np.float32), []) + metas: list[dict] = [] + rows: list[np.ndarray] = [] + for rid in ids: + metas.append(self.meta[rid]) + rows.append(np.asarray(self.vectors[rid], dtype=np.float32)) mat = np.vstack(rows) norms = np.linalg.norm(mat, axis=1, keepdims=True) norms[norms == 0] = 1.0 mat = mat / norms - metas = [self.meta[rid] for rid in ids] - self._matrix_cache = (ids, mat, metas) - return self._matrix_cache + return (ids, mat, metas) + + # ----- packed-matrix disk cache --------------------------------------- # + + def _invalidate_matrix(self) -> None: + """Drop the in-process matrix and clear the on-disk packed cache once. + + Called on every record write. The on-disk clear happens at most once per + rebuild (guarded by ``_packed_stale``) so a bulk build's thousands of + ``put_record`` calls don't each touch the filesystem. + """ + self._matrix_cache = None + if self._packed_dir is not None and not self._packed_stale: + self._clear_packed() + self._packed_stale = True + + def _packed_paths(self): + d = self._packed_dir + # ``sig`` is written last and removed first, so a half-written or + # half-cleared cache (no/sig-less dir) always reads as invalid. + return { + "sig": d / "sig.json", + "matrix": d / "matrix.npy", + "ids": d / "ids.json", + "metas": d / "metas.json", + } + + def _clear_packed(self) -> None: + if self._packed_dir is None: + return + paths = self._packed_paths() + for key in ("sig", "matrix", "ids", "metas"): # sig first + try: + paths[key].unlink() + except OSError: + pass + + def _load_packed(self): + """Load ``(ids, mmap_matrix, metas)`` from the packed cache, or ``None``.""" + if self._packed_dir is None: + return None + paths = self._packed_paths() + if not paths["sig"].exists(): + return None + try: + sig = json.loads(paths["sig"].read_text(encoding="utf-8")) + if sig.get("format") != _PACKED_FORMAT: + return None + mat = np.load(paths["matrix"], mmap_mode="r") + ids = json.loads(paths["ids"].read_text(encoding="utf-8")) + metas = json.loads(paths["metas"].read_text(encoding="utf-8")) + except (OSError, ValueError): + return None + if len(ids) != mat.shape[0] or len(metas) != len(ids): + return None + return (ids, mat, metas) + + def _save_packed(self, result: tuple[list[str], np.ndarray, list[dict]]) -> None: + """Persist a freshly built matrix to the packed cache (best-effort). + + Skips empty corpora. Writes ``sig.json`` last so a crash mid-write + leaves the cache marked invalid (no sig) rather than torn. + """ + if self._packed_dir is None: + return + ids, mat, metas = result + if not ids: + return + try: + self._packed_dir.mkdir(parents=True, exist_ok=True) + paths = self._packed_paths() + np.save(paths["matrix"], np.asarray(mat, dtype=np.float32)) + paths["ids"].write_text(json.dumps(ids), encoding="utf-8") + paths["metas"].write_text(json.dumps(metas), encoding="utf-8") + paths["sig"].write_text( + json.dumps({"format": _PACKED_FORMAT, "count": len(ids)}), + encoding="utf-8", + ) + self._packed_stale = False + except OSError: + # A read cache that can't be written is non-fatal: fall back to the + # in-process cache (already set by the caller) for this process. + self._clear_packed() diff --git a/ir/strategy.py b/ir/strategy.py index c6bcdcc..62c327e 100644 --- a/ir/strategy.py +++ b/ir/strategy.py @@ -184,6 +184,53 @@ def decompose(self, artifact_id, raw, metadata=None) -> IndexPlan: return IndexPlan(filter_fields=filter_fields, surfaces=surfaces) +#: Shipped strategies addressable by name, for persisting an +#: :class:`IndexingStrategy` in a registry entry and reconstructing it (#58). +#: New shipped strategies register here so a corpus can name its segmentation. +STRATEGY_REGISTRY: dict[str, type] = { + "WholeText": WholeText, + "Chunked": Chunked, + "Skill": Skill, +} + + +def strategy_to_spec(strategy: Any) -> dict: + """A ``{"name", "params"}`` spec for a shipped, scalar-param strategy. + + Captures only scalar constructor parameters (the same identity surface + :func:`ir.index._strategy_id` stamps), so it round-trips the shipped + strategies. A custom strategy, or one wrapping another (e.g. the + :func:`ir.with_synopsis` wrapper), is **not** captured here — those are set + programmatically at build time / by the maintenance layer, not persisted as a + segmentation spec. + """ + name = type(strategy).__name__ + params = { + k: v + for k, v in vars(strategy).items() + if isinstance(v, (str, int, float, bool, type(None))) + } + return {"name": name, "params": params} + + +def strategy_from_spec(spec: Mapping[str, Any] | None) -> "IndexingStrategy | None": + """Reconstruct a shipped strategy from a ``{"name", "params"}`` spec. + + ``None`` (no persisted strategy) returns ``None`` so the caller falls back to + the source preset's default strategy — the back-compatible behavior for v1 + registry entries. + """ + if not spec: + return None + name = spec.get("name") + cls = STRATEGY_REGISTRY.get(name) + if cls is None: + raise ValueError( + f"unknown strategy {name!r}; known: {sorted(STRATEGY_REGISTRY)}" + ) + return cls(**dict(spec.get("params") or {})) + + class Package: """Package strategy: ``name + description`` surface plus README chunks. @@ -242,3 +289,74 @@ def decompose(self, artifact_id, raw, metadata=None) -> IndexPlan: # Drop empty surfaces (e.g. no description and no README). surfaces = [s for s in surfaces if s.text.strip()] return IndexPlan(filter_fields=filter_fields, surfaces=surfaces) + + +class ClaudeTurn: + """Index a Claude Code session turn-pair: user prompt + assistant summary. + + Two surfaces by default — ``user_prompt`` (what the human asked) and + ``assistant_summary`` (the assistant's *final* end-of-turn text, the + highest-signal "here's what I did"; the deliberation before it is mostly + noise) — so a query can target either side via ``surfaces={"user_prompt"}`` / + ``{"assistant_summary"}``. With ``include_full=True`` a third + ``assistant_full`` surface (all the turn's assistant natural-language text) + trades noise for recall — off by default. Session / project / time / model / + tool-use become hard-filter fields. + + The raw artifact is a turn-pair record (see + :func:`priv.claude_transcripts.turn_pair_records`): a mapping with + ``user_prompt`` / ``assistant_summary`` / ``assistant_full`` plus metadata. + """ + + def __init__(self, *, include_full: bool = False): + self.include_full = include_full + + def decompose(self, artifact_id, raw, metadata=None) -> IndexPlan: + meta = dict(metadata or {}) + raw = raw if isinstance(raw, Mapping) else {} + user = str(raw.get("user_prompt", "") or "").strip() + summary = str(raw.get("assistant_summary", "") or "").strip() + full = str(raw.get("assistant_full", "") or "").strip() + title = str(raw.get("session_title", "") or "").strip() + filter_fields = { + "session_id": raw.get("session_id"), + "project": raw.get("project"), + "cwd": raw.get("cwd"), + "git_branch": raw.get("git_branch"), + "timestamp": raw.get("timestamp"), + "model": raw.get("model"), + "has_tool_use": bool(raw.get("has_tool_use")), + "session_title": raw.get("session_title"), + **meta, + } + surfaces = [] + # A dedicated per-session title record indexes just the title (a cheap, + # searchable "what was this session about" surface). + if raw.get("record_type") == "session_title": + if title: + surfaces.append( + Surface(artifact_id, "session_title", title, granularity="field") + ) + return IndexPlan(filter_fields=filter_fields, surfaces=surfaces) + if user: + surfaces.append( + Surface(artifact_id, "user_prompt", user, granularity="field") + ) + if summary: + surfaces.append( + Surface( + artifact_id, "assistant_summary", summary, granularity="field" + ) + ) + if self.include_full and full and full != summary: + surfaces.append( + Surface( + artifact_id, "assistant_full", full, granularity="document" + ) + ) + return IndexPlan(filter_fields=filter_fields, surfaces=surfaces) + + +# Register the strategies defined after STRATEGY_REGISTRY's literal above. +STRATEGY_REGISTRY["Package"] = Package +STRATEGY_REGISTRY["ClaudeTurn"] = ClaudeTurn diff --git a/ir/synopsis.py b/ir/synopsis.py index c325b18..c6ca6ee 100644 --- a/ir/synopsis.py +++ b/ir/synopsis.py @@ -19,9 +19,10 @@ a synth that returns ``""`` simply leaves the artifact with its other surfaces. ``synthesize: Callable[[Artifact], str]`` is **injectable** (a test double, or -your own summarizer); omitted, it is built lazily on :mod:`oa` via -:func:`make_llm_synthesizer` (the ``make_llm_*`` idiom — ``import ir`` stays -offline, ``oa`` is imported only on the first synthesis). +your own summarizer); omitted, it is built lazily on :mod:`aix` (the +multi-provider LLM facade) via :func:`make_llm_synthesizer` (the ``make_llm_*`` +idiom — ``import ir`` stays offline, ``aix`` is imported only on the first +synthesis). **Staleness.** The wrapper exposes its identity as scalar attributes (``synthesizer_id``, ``synopsis_kind``) and holds the inner strategy, so @@ -79,13 +80,13 @@ def _prompt_hash(prompt: str) -> str: def _default_llm_summarizer( prompt: str, model: str | None, **prompt_function_kwargs: Any ): - """Build the default text→synopsis summarizer on :mod:`oa` (lazy import).""" - import oa + """Build the default text→synopsis summarizer on :mod:`aix` (lazy import).""" + import aix kwargs = dict(prompt_function_kwargs) if model is not None: kwargs.setdefault("model", model) - fn = oa.prompt_function(prompt, name="synthesize_synopsis", **kwargs) + fn = aix.prompt_func(prompt, name="synthesize_synopsis", **kwargs) def summarize(text: str) -> str: return str(fn(text=text) or "").strip() @@ -105,8 +106,8 @@ def make_llm_synthesizer( """An LLM-backed :data:`Synthesizer` (:class:`~ir.base.Artifact` → synopsis). ``summarize`` is an injectable ``text -> str`` callable (a test double, or - your own summarizer); when omitted it is built lazily on :mod:`oa` - (``oa.prompt_function``) on the **first** synthesis and reused — so importing + your own summarizer); when omitted it is built lazily on :mod:`aix` + (``aix.prompt_func``) on the **first** synthesis and reused — so importing this module, and even constructing the synthesizer, stays offline. The artifact's text is extracted with :func:`ir.strategy.text_of` using ``text_key`` — which :func:`with_synopsis` threads from the inner strategy, so @@ -115,7 +116,7 @@ def make_llm_synthesizer( fabricated summary). The returned callable carries a ``synthesizer_id`` attribute (default - ``"oa:{model}:{sha(prompt)[:12]}"``) that :func:`with_synopsis` reads into the + ``"aix:{model}:{sha(prompt)[:12]}"``) that :func:`with_synopsis` reads into the corpus's ``strategy_id`` for staleness — a prompt or model change re-synthesizes. """ cache: dict[str, Callable[[str], str]] = {} @@ -140,7 +141,7 @@ def synthesize(artifact: Artifact) -> str: return out.strip() if isinstance(out, str) else "" synthesize.synthesizer_id = ( - synthesizer_id or f"oa:{model or 'default'}:{_prompt_hash(prompt)}" + synthesizer_id or f"aix:{model or 'default'}:{_prompt_hash(prompt)}" ) return synthesize @@ -229,7 +230,7 @@ def with_synopsis( strategy: the inner :class:`~ir.strategy.IndexingStrategy` (``Chunked``, ``Package``, ...). Its surfaces are kept; the synopsis is prepended. synthesize: an injectable ``Artifact -> str`` (test double / custom - summarizer). Omitted → :func:`make_llm_synthesizer` (lazy ``oa``). + summarizer). Omitted → :func:`make_llm_synthesizer` (lazy ``aix``). synthesizer_id: explicit identity stamp for staleness (recommended when injecting an unnamed callable / lambda). Omitted → the synthesizer's own ``synthesizer_id`` / ``__qualname__``. diff --git a/pyproject.toml b/pyproject.toml index dbaf918..6935a40 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -51,7 +51,7 @@ docs = [ "sphinx-rtd-theme>=1.0", ] llm = [ - "oa", + "aix", ] [tool.ruff] diff --git a/tests/test_eval_gen.py b/tests/test_eval_gen.py index 0627ab4..cb0c3ad 100644 --- a/tests/test_eval_gen.py +++ b/tests/test_eval_gen.py @@ -2,7 +2,7 @@ The LLM is injected (`query_generator` / `abstention_generator`), so masking, the leakage guard, gold assignment, the abstention fraction, and end-to-end -scorability are all tested without `oa` or a model. +scorability are all tested without `aix` or a model. """ import warnings @@ -372,31 +372,37 @@ def test_build_eval_set_forwards_mask_names_false(): # --------------------------------------------------------------------------- # -# Default oa-backed generators — prompt assembly (skipped if oa absent) +# Default aix-backed generators — prompt assembly + egress wiring +# (mock aix's chat so no LLM/network is touched; skipped if aix absent) # --------------------------------------------------------------------------- # -def test_oa_prompts_substitute_placeholders(): - oa = pytest.importorskip("oa") - # prompt-only (prompt_func=None): assert on the ASSEMBLED PROMPT, not the - # generate() wrapper (which, with prompt_func=None, would iterate the string). - bt = oa.prompt_function(eg.BACKTRANSLATION_PROMPT, name="bt", prompt_func=None) - prompt = bt(description="DESC_MARKER", n=4) - assert "DESC_MARKER" in prompt and "Write 4 natural" in prompt - ab = oa.prompt_function(eg.ABSTENTION_PROMPT, name="ab", prompt_func=None) - assert "THEME_MARKER" in ab(theme="THEME_MARKER", n=2) +def test_default_generator_prompts_substitute_placeholders(monkeypatch): + aix_prompts = pytest.importorskip("aix.prompts") + captured = {} + def fake_chat(prompt, **kwargs): + captured["prompt"] = prompt + return "q1\nq2\nq3\nq4" -def test_parse_lines_is_wired_as_egress(): - oa = pytest.importorskip("oa") + monkeypatch.setattr(aix_prompts, "chat", fake_chat) + gen = eg.make_default_query_generator() + out = gen("DESC_MARKER", n=4) + assert "DESC_MARKER" in captured["prompt"] and "Write 4 natural" in captured["prompt"] + assert out == ["q1", "q2", "q3", "q4"] - def fake_llm(*args, **kwargs): - return "1. foo\n- bar" + monkeypatch.setattr(aix_prompts, "chat", lambda p, **k: captured.update(prompt=p) or "t") + eg.make_default_abstention_generator()(n=2, theme="THEME_MARKER") + assert "THEME_MARKER" in captured["prompt"] - fn = oa.prompt_function( - "x {description} {n}", egress=eg._parse_lines, prompt_func=fake_llm - ) - assert fn(description="d", n=2) == ["foo", "bar"] + +def test_parse_lines_is_wired_as_egress(monkeypatch): + aix_prompts = pytest.importorskip("aix.prompts") + # The default generator's egress (_parse_lines) strips list markers like + # "1. " / "- " from the LLM text. + monkeypatch.setattr(aix_prompts, "chat", lambda p, **k: "1. foo\n- bar") + gen = eg.make_default_query_generator() + assert gen("d", n=2) == ["foo", "bar"] # --------------------------------------------------------------------------- # diff --git a/tests/test_formulate.py b/tests/test_formulate.py index e91772a..d35feb6 100644 --- a/tests/test_formulate.py +++ b/tests/test_formulate.py @@ -2,7 +2,7 @@ Identity by default (search unchanged); a single rewrite redirects retrieval; multi-query fan-out unions+fuses; the LLM formulator is injectable and falls back -to identity on any failure. Hermetic: the light embedder, no ``oa``. +to identity on any failure. Hermetic: the light embedder, no ``aix``. """ import ir @@ -94,8 +94,8 @@ def test_make_llm_formulator_falls_back_on_empty_reply(): assert make_llm_formulator(rewriter=lambda q: [])("deploy") == "deploy" -def test_make_llm_formulator_with_injected_rewriter_needs_no_oa(): - # The injected-rewriter path must not touch the lazy oa builder; a single-str +def test_make_llm_formulator_with_injected_rewriter_needs_no_aix(): + # The injected-rewriter path must not touch the lazy aix builder; a single-str # rewrite is normalized to a one-element list (a valid Formulator output). f = make_llm_formulator(rewriter=lambda q: q) assert f("anything") == ["anything"] diff --git a/tests/test_lazy_open.py b/tests/test_lazy_open.py new file mode 100644 index 0000000..b3c0962 --- /dev/null +++ b/tests/test_lazy_open.py @@ -0,0 +1,71 @@ +"""``open_corpus`` defers the embedding-model load until a query embeds (#56). + +The model load is the dominant per-process cost; ``ls`` / ``info`` and +lexical-only search never embed, so they must not pay it. A corpus knows its +``embedder_id`` from stored config immediately, but resolves the model lazily. +""" + +import numpy as np + +import ir +import ir.index as idx +from ir.index import _LazyEmbedder + + +def test_lazy_embedder_resolves_only_on_first_call(monkeypatch): + calls = [] + + def fake_make_embedder(spec, **kw): + calls.append(spec) + + def emb(texts, **kwargs): + return np.ones((len(list(texts)), 4), dtype=np.float32) + + return emb, "fake-id" + + monkeypatch.setattr(idx, "make_embedder", fake_make_embedder) + + lazy = _LazyEmbedder("default") + assert calls == [] # constructing does not resolve the model + + out = lazy(["x"]) # first call resolves + assert calls == ["default"] + assert out.shape == (1, 4) + + lazy(["y"]) # cached: no second resolve + assert calls == ["default"] + assert lazy.embedder_id == "fake-id" + + +def test_open_corpus_does_not_load_model_for_len_or_lexical(tmp_path, monkeypatch): + monkeypatch.setenv("IR_DATA_DIR", str(tmp_path / "data")) + monkeypatch.setenv("IR_CONFIG_DIR", str(tmp_path / "config")) + monkeypatch.setenv("IR_CACHE_DIR", str(tmp_path / "cache")) + + src = ir.CorpusSource.from_mapping( + {"a": "alpha apple avocado", "b": "beta banana blueberry"}, + name="lz", + strategy=ir.WholeText(), + ) + ir.build(src, embedder="light") # builds into the tmp data dir + + # Count model resolutions from here on (build's own load is already done). + real = idx.make_embedder + calls = [] + + def counting(spec, **kw): + calls.append(spec) + return real(spec, **kw) + + monkeypatch.setattr(idx, "make_embedder", counting) + + corpus = ir.open_corpus("lz") + assert corpus.embedder_id # known from config without resolving the model + assert len(corpus) == 2 + assert calls == [] # opening + len(): no model load + + corpus.search("alpha", mode="lexical") + assert calls == [] # lexical ranks on text alone: still no model load + + corpus.search("alpha", mode="dense") + assert len(calls) == 1 # dense embeds the query: resolves exactly once diff --git a/tests/test_policy.py b/tests/test_policy.py new file mode 100644 index 0000000..c982e07 --- /dev/null +++ b/tests/test_policy.py @@ -0,0 +1,200 @@ +"""Per-corpus policy + registry v2 strategy persistence + idempotent maintain (#58).""" + +from datetime import datetime, timedelta + +import pytest + +import ir +from ir import policy as P +from ir.policy import MaintenancePolicy, in_downtime, is_reindex_due, resolve_policy +from ir.strategy import Chunked, strategy_from_spec, strategy_to_spec + +# --------------------------------------------------------------------------- # +# Strategy spec round-trip (the v1 gap: registry could not persist a strategy) +# --------------------------------------------------------------------------- # + + +def test_strategy_spec_roundtrip(): + spec = strategy_to_spec(Chunked(chunk_size=800, overlap=100)) + assert spec["name"] == "Chunked" + assert spec["params"]["chunk_size"] == 800 + s2 = strategy_from_spec(spec) + assert isinstance(s2, Chunked) and s2.chunk_size == 800 and s2.overlap == 100 + + +def test_strategy_from_spec_none_is_none(): + assert strategy_from_spec(None) is None + + +def test_strategy_from_spec_unknown_raises(): + with pytest.raises(ValueError, match="unknown strategy"): + strategy_from_spec({"name": "Nope", "params": {}}) + + +# --------------------------------------------------------------------------- # +# Policy resolution + smart per-kind defaults +# --------------------------------------------------------------------------- # + + +def test_sessions_kind_default_is_interval_with_downtime(): + pol = P.default_policy_for_kind("sessions") + assert pol.reindex.on == "interval" and pol.reindex.every_hours == 24 + assert pol.synopsis.enabled is False + assert pol.synopsis.downtime_hours == (2, 6) + + +def test_small_corpus_kinds_default_to_source_change(): + for kind in ("skills", "packages", "reports", "files"): + assert P.default_policy_for_kind(kind).reindex.on == "source-change" + + +def test_entry_maintenance_overrides_kind_default(): + pol = resolve_policy( + {"kind": "sessions", "maintenance": {"synopsis": {"enabled": True}}} + ) + assert pol.synopsis.enabled is True # overridden + assert pol.reindex.on == "interval" # kept from the kind default + + +def test_v1_entry_resolves_to_kind_default(): + assert resolve_policy({"kind": "skills"}).reindex.on == "source-change" + + +# --------------------------------------------------------------------------- # +# Timing predicates +# --------------------------------------------------------------------------- # + + +def test_source_change_is_always_due(): + assert is_reindex_due(MaintenancePolicy(), None, datetime(2026, 6, 16, 12)) is True + + +def test_interval_due_only_when_stale(): + pol = resolve_policy({"kind": "sessions"}) # interval 24h + now = datetime(2026, 6, 16, 12) + assert is_reindex_due(pol, None, now) is True + assert is_reindex_due(pol, now - timedelta(hours=1), now) is False + assert is_reindex_due(pol, now - timedelta(hours=25), now) is True + + +def test_manual_is_never_due(): + pol = resolve_policy({"kind": "x", "maintenance": {"reindex": {"on": "manual"}}}) + assert is_reindex_due(pol, None, datetime(2026, 6, 16, 12)) is False + + +def test_downtime_window_and_midnight_wrap(): + day = resolve_policy( + {"kind": "s", "maintenance": {"synopsis": {"downtime_hours": [2, 6]}}} + ) + assert in_downtime(day, datetime(2026, 6, 16, 3)) is True + assert in_downtime(day, datetime(2026, 6, 16, 7)) is False + wrap = resolve_policy( + {"kind": "s", "maintenance": {"synopsis": {"downtime_hours": [22, 6]}}} + ) + assert in_downtime(wrap, datetime(2026, 6, 16, 23)) is True + assert in_downtime(wrap, datetime(2026, 6, 16, 5)) is True + assert in_downtime(wrap, datetime(2026, 6, 16, 12)) is False + + +def test_no_downtime_window_is_always(): + assert in_downtime(MaintenancePolicy(), datetime(2026, 6, 16, 12)) is True + + +# --------------------------------------------------------------------------- # +# Registry v2 persistence (isolated config dir) +# --------------------------------------------------------------------------- # + + +def test_register_persists_and_reconstructs_strategy(tmp_path, monkeypatch): + monkeypatch.setenv("IR_CONFIG_DIR", str(tmp_path / "cfg")) + ir.register( + "notes", + "files", + root=str(tmp_path), + strategy=Chunked(chunk_size=900), + maintenance={"reindex": {"on": "interval", "every_hours": 12}}, + ) + entry = ir.corpora()["notes"] + assert entry["strategy"]["params"]["chunk_size"] == 900 + assert entry["maintenance"]["reindex"]["every_hours"] == 12 + + from ir.registry import source_from_entry + + src = source_from_entry("notes", entry) + assert isinstance(src.indexing_strategy, Chunked) + assert src.indexing_strategy.chunk_size == 900 + + +def test_register_rejects_bad_reindex_trigger(tmp_path, monkeypatch): + monkeypatch.setenv("IR_CONFIG_DIR", str(tmp_path / "cfg")) + with pytest.raises(ValueError, match="unknown reindex trigger"): + ir.register("x", "skills", maintenance={"reindex": {"on": "whenever"}}) + + +# --------------------------------------------------------------------------- # +# maintain: builds when due, no-ops within the interval (idempotent) +# --------------------------------------------------------------------------- # + + +def test_maintain_builds_then_noops_within_interval(tmp_path, monkeypatch): + monkeypatch.setenv("IR_CONFIG_DIR", str(tmp_path / "cfg")) + monkeypatch.setenv("IR_DATA_DIR", str(tmp_path / "data")) + monkeypatch.setenv("IR_CACHE_DIR", str(tmp_path / "cache")) + docs = tmp_path / "docs" + docs.mkdir() + (docs / "a.md").write_text("alpha apple avocado", encoding="utf-8") + ir.register( + "n", + "files", + root=str(docs), + embedder="light", + maintenance={"reindex": {"on": "interval", "every_hours": 24}}, + ) + + now = datetime(2026, 6, 16, 12) + (r1,) = ir.maintain("n", now=now) + assert r1.ran and r1.reindex and r1.records >= 1 + + (r2,) = ir.maintain("n", now=now + timedelta(hours=1)) # within interval + assert not r2.ran and "not due" in r2.reason + + (r3,) = ir.maintain("n", now=now + timedelta(hours=25)) # past interval + assert r3.ran + + +def test_maintain_defers_synopsis_build_outside_downtime(tmp_path, monkeypatch): + monkeypatch.setenv("IR_CONFIG_DIR", str(tmp_path / "cfg")) + monkeypatch.setenv("IR_DATA_DIR", str(tmp_path / "data")) + monkeypatch.setenv("IR_CACHE_DIR", str(tmp_path / "cache")) + docs = tmp_path / "docs" + docs.mkdir() + (docs / "a.md").write_text("alpha", encoding="utf-8") + ir.register( + "n", + "files", + root=str(docs), + embedder="light", + maintenance={ + "reindex": {"on": "source-change"}, + "synopsis": {"enabled": True, "downtime_hours": [2, 6]}, + }, + ) + # Noon is outside [2, 6): the (LLM-touching) synopsis build is deferred, so + # nothing is built — and crucially no aix/LLM is reached. + (r,) = ir.maintain("n", now=datetime(2026, 6, 16, 12)) + assert not r.ran and "downtime" in r.reason + assert len(ir.open_corpus("n")) == 0 + + +def test_maintain_dry_run_does_not_build(tmp_path, monkeypatch): + monkeypatch.setenv("IR_CONFIG_DIR", str(tmp_path / "cfg")) + monkeypatch.setenv("IR_DATA_DIR", str(tmp_path / "data")) + monkeypatch.setenv("IR_CACHE_DIR", str(tmp_path / "cache")) + docs = tmp_path / "docs" + docs.mkdir() + (docs / "a.md").write_text("alpha", encoding="utf-8") + ir.register("n", "files", root=str(docs), embedder="light") + (r,) = ir.maintain("n", now=datetime(2026, 6, 16, 12), dry_run=True) + assert not r.ran and "dry-run" in r.reason + # nothing built: corpus is still empty + assert len(ir.open_corpus("n")) == 0 diff --git a/tests/test_sessions_corpus.py b/tests/test_sessions_corpus.py new file mode 100644 index 0000000..f15686c --- /dev/null +++ b/tests/test_sessions_corpus.py @@ -0,0 +1,139 @@ +"""Claude Code sessions corpus — ClaudeTurn strategy + from_claude_sessions (#57). + +The transcript parsing lives in ``priv``; here we inject a ``fetcher`` of +turn-pair records so the ir-side corpus (surfaces, filter fields, presets) is +tested without ``priv`` or real transcripts. +""" + +import ir +from ir.store import CorpusStore +from ir.strategy import ClaudeTurn, strategy_from_spec, strategy_to_spec + +FAKE = [ + { + "id": "s1:u1", + "user_prompt": "how do I deploy the app to the server", + "assistant_summary": "I deployed it via the deploy script and verified it live", + "assistant_full": "Let me check the config... I deployed it via the deploy " + "script and verified it live", + "session_id": "s1", + "cwd": "/x/myproj", + "project": "myproj", + "git_branch": "main", + "timestamp": "2026-06-10T10:00:00Z", + "model": "claude-opus-4-8", + "has_tool_use": True, + }, + { + "id": "s1:u2", + "user_prompt": "fix the failing numpy import error", + "assistant_summary": "The numpy 2.x ABI mismatch caused it; pinned numpy " + "below 2 and the tests pass now", + "assistant_full": "Investigating the traceback... The numpy 2.x ABI mismatch " + "caused it; pinned numpy below 2 and the tests pass now", + "session_id": "s1", + "cwd": "/x/myproj", + "project": "myproj", + "git_branch": "main", + "timestamp": "2026-06-11T09:00:00Z", + "model": "claude-opus-4-8", + "has_tool_use": False, + }, + { + "id": "s2:u1", + "user_prompt": "add a dark mode toggle", + "assistant_summary": "Added a dark mode toggle wired to a theme context", + "assistant_full": "Added a dark mode toggle wired to a theme context", + "session_id": "s2", + "cwd": "/x/other", + "project": "other", + "git_branch": "dev", + "timestamp": "2026-06-12T08:00:00Z", + "model": "claude-sonnet-4-6", + "has_tool_use": True, + }, +] + + +def _corpus(**kw): + src = ir.CorpusSource.from_claude_sessions(fetcher=lambda: FAKE, **kw) + return ir.build(src, store=CorpusStore.memory(), embedder="light") + + +def test_claudeturn_default_surfaces_and_filter_fields(): + plan = ClaudeTurn().decompose("s1:u1", FAKE[0]) + assert {s.kind for s in plan.surfaces} == {"user_prompt", "assistant_summary"} + assert plan.filter_fields["project"] == "myproj" + assert plan.filter_fields["has_tool_use"] is True + assert plan.filter_fields["model"] == "claude-opus-4-8" + + +def test_claudeturn_include_full_adds_surface(): + plan = ClaudeTurn(include_full=True).decompose("s1:u1", FAKE[0]) + assert "assistant_full" in {s.kind for s in plan.surfaces} + + +def test_claudeturn_session_title_record_indexes_the_title(): + rec = { + "id": "s9:__title__", + "record_type": "session_title", + "session_title": "Fixing the numpy ABI crash", + "session_id": "s9", + "project": "ir", + "user_prompt": "", + "assistant_summary": "", + } + plan = ClaudeTurn().decompose("s9:__title__", rec) + assert {s.kind for s in plan.surfaces} == {"session_title"} + assert plan.surfaces[0].text == "Fixing the numpy ABI crash" + assert plan.filter_fields["session_title"] == "Fixing the numpy ABI crash" + + +def test_claudeturn_turn_record_carries_session_title_as_metadata(): + rec = dict(FAKE[0], session_title="the deploy session") + plan = ClaudeTurn().decompose(rec["id"], rec) + # a turn record still indexes user/assistant — not a title surface + assert {s.kind for s in plan.surfaces} == {"user_prompt", "assistant_summary"} + assert plan.filter_fields["session_title"] == "the deploy session" + + +def test_build_indexes_two_surfaces_per_turn(): + corpus = _corpus() + assert len(corpus) == 3 * 2 # 3 turns × {user_prompt, assistant_summary} + + +def test_search_targets_assistant_side(): + corpus = _corpus() + hits = corpus.search( + "numpy abi mismatch", surfaces={"assistant_summary"}, mode="lexical", k=3 + ) + assert hits and hits[0].artifact_id == "s1:u2" + + +def test_search_targets_user_side(): + corpus = _corpus() + hits = corpus.search( + "deploy the app", surfaces={"user_prompt"}, mode="lexical", k=3 + ) + assert hits and hits[0].artifact_id == "s1:u1" + + +def test_filter_by_project_discriminates(): + corpus = _corpus() + hits = corpus.search("toggle", filter={"project": "other"}, mode="lexical", k=5) + assert hits and all(h.metadata.get("project") == "other" for h in hits) + + +def test_claudeturn_spec_roundtrips_for_registry_persistence(): + spec = strategy_to_spec(ClaudeTurn(include_full=True)) + assert spec["name"] == "ClaudeTurn" and spec["params"]["include_full"] is True + s2 = strategy_from_spec(spec) + assert isinstance(s2, ClaudeTurn) and s2.include_full is True + + +def test_sessions_preset_resolves_and_has_interval_policy(tmp_path, monkeypatch): + monkeypatch.setenv("IR_CONFIG_DIR", str(tmp_path / "cfg")) + # The sessions kind's smart default is interval reindex with a downtime window. + pol = ir.default_policy_for_kind("sessions") + assert pol.reindex.on == "interval" + assert pol.synopsis.downtime_hours == (2, 6) diff --git a/tests/test_store_packed.py b/tests/test_store_packed.py new file mode 100644 index 0000000..b255cbb --- /dev/null +++ b/tests/test_store_packed.py @@ -0,0 +1,100 @@ +"""Packed-matrix disk cache + vector-free ``metas()`` — the #56 perf paths. + +These exercise the file-backed (``packed_dir``) behavior that the in-memory +store in ``test_store.py`` cannot: persisting one normalized matrix so a fresh +process reopens it with a few reads instead of a per-record vector-file storm, +and serving ``metas()`` without touching vectors (for lexical-only ranking). +""" + +import numpy as np + +from ir.base import Record +from ir.store import CorpusStore, _json_store, _ndarray_store + + +def _rec(rid, *, vec=(1.0, 0.0, 0.0), text="t"): + return Record( + id=rid, + artifact_id=f"art_{rid}", + surface_kind="document", + surface_index=0, + text=text, + vector=np.asarray(vec, dtype=np.float32), + metadata={"owner": "ours"}, + ) + + +def _file_store(tmp_path): + """A file-backed store with the on-disk packed cache enabled.""" + root = tmp_path / "corpus" + store = CorpusStore( + meta=_json_store(root / "meta"), + vectors=_ndarray_store(root / "vectors"), + ledger=_json_store(root / "ledger"), + config=_json_store(root / "config"), + calibration=_json_store(root / "calibration"), + links=_json_store(root / "links"), + packed_dir=root / "matrix", + ) + return store, root + + +def test_packed_cache_written_then_reloaded_by_fresh_store(tmp_path): + store, root = _file_store(tmp_path) + store.put_record(_rec("r1", vec=(3.0, 0.0, 0.0))) + store.put_record(_rec("r2", vec=(0.0, 4.0, 0.0))) + ids, mat, metas = store.matrix() # builds from records + persists packed + assert (root / "matrix" / "sig.json").exists() + + # A brand-new store over the same dir must reload from the packed cache and + # return identical ids / normalized matrix / metas. + store2, _ = _file_store(tmp_path) + ids2, mat2, metas2 = store2.matrix() + assert ids2 == ids + np.testing.assert_allclose(np.asarray(mat2), np.asarray(mat), atol=1e-6) + assert metas2 == metas + + +def test_packed_cache_cleared_on_write_then_rebuilt(tmp_path): + store, root = _file_store(tmp_path) + store.put_record(_rec("r1")) + store.matrix() + sig = root / "matrix" / "sig.json" + assert sig.exists() + + store.put_record(_rec("r2")) # any write invalidates the packed cache + assert not sig.exists() + + ids, mat, _ = store.matrix() # rebuilds with both rows + re-persists + assert mat.shape[0] == 2 + assert sig.exists() + + +def test_metas_matches_matrix_metas(tmp_path): + store, _ = _file_store(tmp_path) + store.put_record(_rec("r1", text="alpha")) + store.put_record(_rec("r2", text="beta")) + ids_m, metas_m = store.metas() # vector-free path + ids, _mat, metas = store.matrix() + assert ids_m == ids + assert metas_m == metas + + +def test_corrupt_packed_cache_falls_back_to_rebuild(tmp_path): + store, root = _file_store(tmp_path) + store.put_record(_rec("r1")) + store.matrix() + # A torn / corrupt sig must read as invalid → rebuild, never raise. + (root / "matrix" / "sig.json").write_text("not json", encoding="utf-8") + store2, _ = _file_store(tmp_path) + ids, mat, metas = store2.matrix() + assert mat.shape[0] == 1 + + +def test_memory_store_keeps_purely_in_process(tmp_path): + store = CorpusStore.memory() + assert store._packed_dir is None # no disk cache for the in-memory store + store.put_record(_rec("r1")) + assert store.matrix()[1].shape == (1, 3) + ids, metas = store.metas() + assert ids == ["r1"] and len(metas) == 1 diff --git a/tests/test_synopsis.py b/tests/test_synopsis.py index 192ad5f..11542df 100644 --- a/tests/test_synopsis.py +++ b/tests/test_synopsis.py @@ -10,7 +10,7 @@ - incremental rebuild re-synthesizes only changed artifacts, and a synthesizer-identity (or inner-strategy) change re-synthesizes — staleness via the ledger ``strategy_id`` and the recursive ``_strategy_id``. -- offline import preserved: ``oa`` is lazy, an injected synthesizer never needs it. +- offline import preserved: ``aix`` is lazy, an injected synthesizer never needs it. Hermetic: light embedder + memory store + injected synthesizers. """ @@ -187,7 +187,7 @@ def test_strategy_id_recurses_into_inner_strategy_and_synthesizer(): # --------------------------------------------------------------------------- # -# Offline: oa is lazy; an injected synthesizer never needs it +# Offline: aix is lazy; an injected synthesizer never needs it # --------------------------------------------------------------------------- # @@ -209,15 +209,15 @@ def boom(text): def test_default_synthesizer_constructs_offline_and_stamps_identity(): - # No synthesize injected -> the lazy-oa default; constructing must not need oa, - # and an empty-text artifact short-circuits before the oa path is reached. + # No synthesize injected -> the lazy-aix default; constructing must not need + # aix, and an empty-text artifact short-circuits before the aix path is reached. strat = with_synopsis(ir.Chunked()) - assert strat.synthesizer_id.startswith("oa:") + assert strat.synthesizer_id.startswith("aix:") assert strat.synthesize(Artifact("x", "")) == "" # a prompt change shifts the default identity (re-synthesis trigger) a = make_llm_synthesizer() b = make_llm_synthesizer(prompt="a very different prompt {text}") - assert a.synthesizer_id.startswith("oa:") and a.synthesizer_id != b.synthesizer_id + assert a.synthesizer_id.startswith("aix:") and a.synthesizer_id != b.synthesizer_id # --------------------------------------------------------------------------- # @@ -313,25 +313,25 @@ def test_with_synopsis_package_prepends_synopsis_before_description_and_routes() def test_default_synthesizer_construction_is_lazy_and_offline(monkeypatch): - # Mutation-resistant offline guarantee: poison oa so any use raises, then + # Mutation-resistant offline guarantee: poison aix so any use raises, then # confirm constructing the default synthesizer (and the wrapper) and the # injected path never reach it. An eager-import regression would fail here. import sys import types - poison = types.ModuleType("oa") + poison = types.ModuleType("aix") def _boom(*a, **k): - raise AssertionError("oa.prompt_function reached on an offline path") + raise AssertionError("aix.prompt_func reached on an offline path") - poison.prompt_function = _boom - monkeypatch.setitem(sys.modules, "oa", poison) + poison.prompt_func = _boom + monkeypatch.setitem(sys.modules, "aix", poison) synth = make_llm_synthesizer() # lazy: builds nothing yet - strat = with_synopsis(ir.Chunked()) # default synth: still no oa - assert synth(Artifact("x", "")) == "" # empty text short-circuits the oa path + strat = with_synopsis(ir.Chunked()) # default synth: still no aix + assert synth(Artifact("x", "")) == "" # empty text short-circuits the aix path assert strat.synthesize(Artifact("y", "")) == "" - # an injected summarizer is wholly oa-free even on non-empty text + # an injected summarizer is wholly aix-free even on non-empty text assert make_llm_synthesizer(summarize=lambda t: "S")(Artifact("z", "body")) == "S"