diff --git a/ir/strategy.py b/ir/strategy.py
index 08cf8cd..977d022 100644
--- a/ir/strategy.py
+++ b/ir/strategy.py
@@ -26,7 +26,7 @@
 
 import re
 from collections.abc import Mapping
-from typing import Any, Protocol, runtime_checkable
+from typing import Any, Callable, Protocol, runtime_checkable
 
 from .base import IndexPlan, Surface
 
@@ -231,12 +231,39 @@ def strategy_from_spec(spec: Mapping[str, Any] | None) -> "IndexingStrategy | No
     return cls(**dict(spec.get("params") or {}))
 
 
+def _default_deps_text(deps: list[str]) -> str:
+    """Prefix-form serialization of bare dependency names for the ``deps`` surface.
+
+    Strips version specifiers / extras / markers (via :func:`ir.graph._dep_name`),
+    de-duplicates, preserves order, and drops empties — e.g.
+    ``["numpy>=1", "sentence-transformers", "numpy"]`` ->
+    ``"Depends on: numpy, sentence-transformers"``. Returns ``""`` when there are
+    no usable names (``decompose`` then drops the empty surface).
+    """
+    from .graph import _dep_name
+
+    names = list(dict.fromkeys(n for n in (_dep_name(d) for d in deps) if n))
+    return "Depends on: " + ", ".join(names) if names else ""
+
+
 class Package:
     """Package strategy: ``name + description`` surface plus README chunks.
 
     Filter fields capture ownership (ours vs third-party), name, deps. AI
     synopsis / problem-class surfaces are a documented extension point.
 
+    With ``embed_deps=True``, ``decompose`` additionally emits one
+    ``Surface(kind="deps", granularity="field")`` whose text is a prefix-form
+    serialization of the **bare** dependency names (``deps_template``, default
+    :func:`_default_deps_text`) — so a query for a domain matches a package by the
+    libraries it depends on (e.g. ``sentence-transformers`` -> embeddings,
+    ``networkx`` -> graphs), and the BM25 leg picks up exact dep-token matches. The
+    deps bag is kept **separate from prose** (its own surface) so a rare library
+    name is not diluted, and deps remain a filter field regardless. ``embed_deps``
+    defaults ``False`` (today's behavior); it folds into the strategy id, so
+    toggling it re-decomposes incrementally. The deps surface is appended **last**,
+    leaving the ``description`` (position 0) and ``readme_chunk`` indices unchanged.
+
     Surface indexing: the ``description`` surface (kept whenever *name* or
     *description* is non-empty) occupies plan position 0, so ``readme_chunk``
     *j* is stored with
@@ -250,9 +277,18 @@ class Package:
     strategy change) — read it with ``metadata.get("n_chunks")``.
     """
 
-    def __init__(self, *, chunk_size: int = 1500, overlap: int = 200):
+    def __init__(
+        self,
+        *,
+        chunk_size: int = 1500,
+        overlap: int = 200,
+        embed_deps: bool = False,
+        deps_template: Callable[[list[str]], str] | None = None,
+    ):
         self.chunk_size = chunk_size
         self.overlap = overlap
+        self.embed_deps = embed_deps
+        self.deps_template = deps_template
 
     def decompose(self, artifact_id, raw, metadata=None) -> IndexPlan:
         meta = dict(metadata or {})
@@ -286,6 +322,15 @@ def decompose(self, artifact_id, raw, metadata=None) -> IndexPlan:
                     metadata={"chunk_index": i, "n_chunks": len(chunks)},
                 )
             )
+        if self.embed_deps:
+            template = self.deps_template or _default_deps_text
+            deps_text = template(list(filter_fields["deps"]))
+            if deps_text and deps_text.strip():
+                # Appended last: keeps the description (0) and readme_chunk indices
+                # stable, so the surface_index contract holds for existing corpora.
+                surfaces.append(
+                    Surface(artifact_id, "deps", deps_text, granularity="field")
+                )
         # Drop empty surfaces (e.g. no description and no README).
         surfaces = [s for s in surfaces if s.text.strip()]
         return IndexPlan(filter_fields=filter_fields, surfaces=surfaces)
diff --git a/tests/test_strategy.py b/tests/test_strategy.py
index 71b3b82..65634c8 100644
--- a/tests/test_strategy.py
+++ b/tests/test_strategy.py
@@ -1,6 +1,15 @@
 """Unit tests for indexing strategies (artifact -> filter fields + surfaces)."""
 
-from ir.strategy import Chunked, Package, Skill, WholeText, _split
+from ir.strategy import (
+    Chunked,
+    Package,
+    Skill,
+    WholeText,
+    _default_deps_text,
+    _split,
+    strategy_from_spec,
+    strategy_to_spec,
+)
 
 
 def test_wholetext_str_and_mapping():
@@ -55,6 +64,64 @@ def test_package_description_plus_readme_chunks_and_filter_fields():
     assert plan.filter_fields["name"] == "vd"
     assert plan.filter_fields["has_readme"] is True
     assert plan.filter_fields["deps"] == ["dol", "numpy"]
+    # embed_deps defaults off: no deps surface, today's behavior preserved
+    assert "deps" not in {s.kind for s in plan.surfaces}
+
+
+def _pkg_raw(deps):
+    return {
+        "name": "vd",
+        "description": "Facade over vector databases.",
+        "readme": "Some readme body about vectors.",
+        "owner": "ours",
+        "deps": deps,
+    }
+
+
+def test_package_embed_deps_adds_a_deps_surface_last():
+    raw = _pkg_raw(["sentence-transformers>=2.0", "networkx", "numpy"])
+    plan = Package(embed_deps=True).decompose("vd", raw, {})
+    deps_surfaces = [s for s in plan.surfaces if s.kind == "deps"]
+    assert len(deps_surfaces) == 1
+    s = deps_surfaces[0]
+    assert s.granularity == "field"
+    # bare names, version specifier stripped, prefix form
+    assert s.text == "Depends on: sentence-transformers, networkx, numpy"
+    # appended last (keeps description/readme_chunk surface_index stable)
+    assert plan.surfaces[-1].kind == "deps"
+    # deps remain a filter field too
+    assert plan.filter_fields["deps"] == ["sentence-transformers>=2.0", "networkx", "numpy"]
+
+
+def test_package_embed_deps_empty_deps_yields_no_surface():
+    plan = Package(embed_deps=True).decompose("vd", _pkg_raw([]), {})
+    assert "deps" not in {s.kind for s in plan.surfaces}
+
+
+def test_package_custom_deps_template():
+    raw = _pkg_raw(["ef", "imbed"])
+    plan = Package(
+        embed_deps=True, deps_template=lambda ds: "uses " + "|".join(ds)
+    ).decompose("vd", raw, {})
+    s = next(s for s in plan.surfaces if s.kind == "deps")
+    assert s.text == "uses ef|imbed"
+
+
+def test_default_deps_text_strips_dedups_and_lowercases():
+    assert _default_deps_text(["NumPy>=1.2", "numpy", "torch[cuda]", "oa ; python_version>'3.9'"]) == (
+        "Depends on: numpy, torch, oa"
+    )
+    assert _default_deps_text([]) == ""
+
+
+def test_package_embed_deps_round_trips_via_spec():
+    spec = strategy_to_spec(Package(embed_deps=True, chunk_size=900))
+    assert spec["name"] == "Package"
+    assert spec["params"]["embed_deps"] is True  # bool param captured
+    restored = strategy_from_spec(spec)
+    assert isinstance(restored, Package)
+    assert restored.embed_deps is True
+    assert restored.chunk_size == 900
 
 
 def test_split_packs_to_chunk_size_not_per_paragraph():