Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 47 additions & 2 deletions ir/strategy.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@

import re
from collections.abc import Mapping
from typing import Any, Protocol, runtime_checkable
from typing import Any, Callable, Protocol, runtime_checkable

from .base import IndexPlan, Surface

Expand Down Expand Up @@ -231,12 +231,39 @@ def strategy_from_spec(spec: Mapping[str, Any] | None) -> "IndexingStrategy | No
return cls(**dict(spec.get("params") or {}))


def _default_deps_text(deps: list[str]) -> str:
    """Render dependency names as a single ``"Depends on: ..."`` line.

    This is the default serializer for the ``deps`` surface. Every entry of
    *deps* is reduced to its bare name by :func:`ir.graph._dep_name` (version
    specifiers, extras and markers removed). Entries that reduce to a falsy
    value are skipped, and a name that occurs more than once is kept only at
    its first position, so the input order is preserved. For example
    ``["numpy>=1", "sentence-transformers", "numpy"]`` becomes
    ``"Depends on: numpy, sentence-transformers"``.

    Returns ``""`` when no usable name remains, which lets ``decompose`` drop
    the empty surface.
    """
    from .graph import _dep_name

    # A dict doubles as an insertion-ordered set: first occurrence wins.
    ordered_names: dict[str, None] = {}
    for requirement in deps:
        bare = _dep_name(requirement)
        if bare:
            ordered_names.setdefault(bare, None)

    if not ordered_names:
        return ""
    return "Depends on: " + ", ".join(ordered_names)


class Package:
"""Package strategy: ``name + description`` surface plus README chunks.

Filter fields capture ownership (ours vs third-party), name, deps. AI
synopsis / problem-class surfaces are a documented extension point.

With ``embed_deps=True``, ``decompose`` additionally emits one
``Surface(kind="deps", granularity="field")`` whose text is a prefix-form
serialization of the **bare** dependency names (``deps_template``, default
:func:`_default_deps_text`) — so a query for a domain matches a package by the
libraries it depends on (e.g. ``sentence-transformers`` -> embeddings,
``networkx`` -> graphs), and the BM25 leg picks up exact dep-token matches. The
deps bag is kept **separate from prose** (its own surface) so a rare library
name is not diluted, and deps remain a filter field regardless. ``embed_deps``
defaults ``False`` (today's behavior); it folds into the strategy id, so
toggling it re-decomposes incrementally. The deps surface is appended **last**,
leaving the ``description`` (position 0) and ``readme_chunk`` indices unchanged.

Surface indexing: the ``description`` surface (kept whenever *name* or
*description* is non-empty) occupies plan position 0, so ``readme_chunk``
*j* is stored with
Expand All @@ -250,9 +277,18 @@ class Package:
strategy change) — read it with ``metadata.get("n_chunks")``.
"""

def __init__(self, *, chunk_size: int = 1500, overlap: int = 200):
def __init__(
    self,
    *,
    chunk_size: int = 1500,
    overlap: int = 200,
    embed_deps: bool = False,
    deps_template: Callable[[list[str]], str] | None = None,
):
    """Record the strategy configuration; every argument is keyword-only.

    Args:
        chunk_size: Stored as ``self.chunk_size``. Presumably the target
            size of a README chunk; confirm against ``decompose``.
        overlap: Stored as ``self.overlap``. Presumably the overlap between
            consecutive README chunks; confirm against ``decompose``.
        embed_deps: When true, ``decompose`` also emits a ``deps`` surface.
        deps_template: Optional callable that turns the dependency list into
            the text of the ``deps`` surface. ``None`` makes ``decompose``
            fall back to ``_default_deps_text``.
    """
    # Chunking parameters.
    self.chunk_size, self.overlap = chunk_size, overlap
    # Settings for the optional "deps" surface.
    self.embed_deps, self.deps_template = embed_deps, deps_template

def decompose(self, artifact_id, raw, metadata=None) -> IndexPlan:
meta = dict(metadata or {})
Expand Down Expand Up @@ -286,6 +322,15 @@ def decompose(self, artifact_id, raw, metadata=None) -> IndexPlan:
metadata={"chunk_index": i, "n_chunks": len(chunks)},
)
)
if self.embed_deps:
template = self.deps_template or _default_deps_text
deps_text = template(list(filter_fields["deps"]))
if deps_text and deps_text.strip():
# Appended last: keeps the description (0) and readme_chunk indices
# stable, so the surface_index contract holds for existing corpora.
surfaces.append(
Surface(artifact_id, "deps", deps_text, granularity="field")
)
# Drop empty surfaces (e.g. no description and no README).
surfaces = [s for s in surfaces if s.text.strip()]
return IndexPlan(filter_fields=filter_fields, surfaces=surfaces)
Expand Down
69 changes: 68 additions & 1 deletion tests/test_strategy.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,15 @@
"""Unit tests for indexing strategies (artifact -> filter fields + surfaces)."""

from ir.strategy import Chunked, Package, Skill, WholeText, _split
from ir.strategy import (
Chunked,
Package,
Skill,
WholeText,
_default_deps_text,
_split,
strategy_from_spec,
strategy_to_spec,
)


def test_wholetext_str_and_mapping():
Expand Down Expand Up @@ -55,6 +64,64 @@ def test_package_description_plus_readme_chunks_and_filter_fields():
assert plan.filter_fields["name"] == "vd"
assert plan.filter_fields["has_readme"] is True
assert plan.filter_fields["deps"] == ["dol", "numpy"]
# embed_deps defaults off: no deps surface, today's behavior preserved
assert "deps" not in {s.kind for s in plan.surfaces}


def _pkg_raw(deps):
return {
"name": "vd",
"description": "Facade over vector databases.",
"readme": "Some readme body about vectors.",
"owner": "ours",
"deps": deps,
}


def test_package_embed_deps_adds_a_deps_surface_last():
    """``embed_deps=True`` yields exactly one field-level ``deps`` surface, placed last."""
    declared = ("sentence-transformers>=2.0", "networkx", "numpy")
    plan = Package(embed_deps=True).decompose("vd", _pkg_raw(list(declared)), {})

    kinds = [surface.kind for surface in plan.surfaces]
    assert kinds.count("deps") == 1

    deps_surface = next(
        surface for surface in plan.surfaces if surface.kind == "deps"
    )
    assert deps_surface.granularity == "field"
    # Bare names in prefix form: the version specifier is gone.
    assert deps_surface.text == "Depends on: sentence-transformers, networkx, numpy"
    # Appended last, so description / readme_chunk surface indices do not shift.
    assert plan.surfaces[-1].kind == "deps"
    # The filter field still carries the dependencies exactly as declared.
    assert plan.filter_fields["deps"] == list(declared)


def test_package_embed_deps_empty_deps_yields_no_surface():
    """An empty dependency list produces no ``deps`` surface even with ``embed_deps=True``."""
    strategy = Package(embed_deps=True)
    plan = strategy.decompose("vd", _pkg_raw([]), {})
    assert all(surface.kind != "deps" for surface in plan.surfaces)


def test_package_custom_deps_template():
    """A caller-supplied ``deps_template`` determines the ``deps`` surface text."""

    def pipe_joined(ds):
        # Custom serializer: raw dependency strings joined with "|".
        return "uses " + "|".join(ds)

    strategy = Package(embed_deps=True, deps_template=pipe_joined)
    plan = strategy.decompose("vd", _pkg_raw(["ef", "imbed"]), {})
    deps_surface = next(
        surface for surface in plan.surfaces if surface.kind == "deps"
    )
    assert deps_surface.text == "uses ef|imbed"


def test_default_deps_text_strips_dedups_and_lowercases():
    """The default serializer normalizes names, removes duplicates and handles empty input."""
    messy = ["NumPy>=1.2", "numpy", "torch[cuda]", "oa ; python_version>'3.9'"]
    # Specifier, extras and marker are stripped; "NumPy" and "numpy" collapse to one.
    assert _default_deps_text(messy) == "Depends on: numpy, torch, oa"
    # No dependencies means no text at all, not a bare prefix.
    assert _default_deps_text([]) == ""


def test_package_embed_deps_round_trips_via_spec():
    """``embed_deps`` and ``chunk_size`` survive a to-spec / from-spec round trip."""
    original = Package(embed_deps=True, chunk_size=900)

    spec = strategy_to_spec(original)
    assert spec["name"] == "Package"
    # The boolean parameter is captured in the serialized params.
    assert spec["params"]["embed_deps"] is True

    restored = strategy_from_spec(spec)
    assert isinstance(restored, Package)
    assert restored.embed_deps is True
    assert restored.chunk_size == 900


def test_split_packs_to_chunk_size_not_per_paragraph():
Expand Down
Loading