# =========================================================================== #
# Package-relevance graded harness (issue #66 / tracking #61)
#
# A graded, named-diagnostic scoreboard for topical-relevance runs.  A
# DiscoveryCase is (query -> gold ids) with a flat-binary gold; the unit here
# is one *artifact* carrying a graded level per theme, so graded gains reach
# nDCG and a *named distractor* / *hard-positive* set can be regression-checked.
# It is the shared, offline, model-free measurement contract that both ef model
# bake-offs and the raglab labeling recipe score against.
# =========================================================================== #

#: Graded relevance levels for the package-relevance harness, ordered from
#: weakest to strongest.  This tuple is the single source of truth that
#: :meth:`PackageRelevanceCase.from_dict` validates labels against.
RELEVANCE_LEVELS = ("none", "tangential", "uses-tools", "strong", "core")

#: Gain awarded per relevance level.  ``uses-tools`` is a *weak* positive (the
#: package merely uses the relevant tooling); ``tangential`` and ``none`` carry
#: gain ``0`` and therefore count as non-relevant.  Because ``core`` outweighs
#: ``uses-tools``, a graded nDCG can rank a core-first ordering above a
#: uses-tools-first one even when recall is identical.
LEVEL_GAINS = {
    "none": 0.0,
    "tangential": 0.0,
    "uses-tools": 1.0,
    "strong": 2.0,
    "core": 3.0,
}


@dataclass(frozen=True)
class PackageRelevanceCase:
    """Graded relevance of a single artifact to one or more themes.

    The unit is the **artifact**, not the query: each case stores the level the
    artifact was judged at for every labelled theme.  The per-theme probe text
    is deliberately *not* stored here; :func:`save_package_cases` writes it to
    the JSONL ``__meta__`` header instead.

    Attributes:
        artifact_id: the package id.
        labels: ``{theme: level}``; :meth:`from_dict` requires every level to
            be one of :data:`RELEVANCE_LEVELS`.
        evidence: ``{theme: short reason}`` explaining the label.
        observed: ``{theme: score}`` recorded for this artifact (optional).
        thin_description: flag marking a package with an empty/near-empty
            description.
        is_distractor: optional explicit ``{theme: bool}`` flags.
        metadata: free-form per-case metadata.
    """

    artifact_id: str
    labels: Mapping[str, str] = field(default_factory=dict)
    evidence: Mapping[str, str] = field(default_factory=dict)
    observed: Mapping[str, float] = field(default_factory=dict)
    thin_description: bool = False
    is_distractor: Mapping[str, bool] = field(default_factory=dict)
    metadata: Mapping[str, Any] = field(default_factory=dict)

    def level(self, theme: str) -> str:
        """Return the level recorded for ``theme``, or ``"none"`` if unlabeled."""
        return self.labels.get(theme, "none")

    def gain(self, theme: str, *, gains: Mapping[str, float] = LEVEL_GAINS) -> float:
        """Return the gain of this artifact for ``theme`` under ``gains``.

        A level that ``gains`` does not list (for example a partial mapping
        supplied by the caller) yields ``0.0`` instead of raising.
        """
        level_name = self.level(theme)
        return float(gains.get(level_name, 0.0))

    def to_dict(self) -> dict:
        """Return a JSON-serializable dict; falsy optional fields are left out."""
        payload: dict[str, Any] = {
            "artifact_id": self.artifact_id,
            "labels": dict(self.labels),
        }
        # (key, value, converter) in the order the keys must appear on disk.
        optional_fields = (
            ("evidence", self.evidence, dict),
            (
                "observed",
                self.observed,
                lambda scores: {k: float(v) for k, v in scores.items()},
            ),
            ("thin_description", self.thin_description, bool),
            ("is_distractor", self.is_distractor, dict),
            ("metadata", self.metadata, dict),
        )
        for key, value, convert in optional_fields:
            if value:
                payload[key] = convert(value)
        return payload

    @classmethod
    def from_dict(cls, d: Mapping[str, Any]) -> "PackageRelevanceCase":
        """Build a case from :meth:`to_dict` output.

        Raises:
            ValueError: if any label level is not in :data:`RELEVANCE_LEVELS`.
            KeyError: if ``artifact_id`` is missing.
        """

        def _section(key: str) -> dict:
            # Absent and null sections both collapse to an empty dict.
            return dict(d.get(key) or {})

        labels = _section("labels")
        bad: dict = {}
        for theme, level_name in labels.items():
            if level_name not in RELEVANCE_LEVELS:
                bad[theme] = level_name
        if bad:
            raise ValueError(
                f"unknown relevance level(s) {bad}; "
                f"expected one of {RELEVANCE_LEVELS}"
            )

        raw_observed = d.get("observed") or {}
        return cls(
            artifact_id=d["artifact_id"],
            labels=labels,
            evidence=_section("evidence"),
            observed={theme: float(score) for theme, score in raw_observed.items()},
            thin_description=bool(d.get("thin_description", False)),
            is_distractor=_section("is_distractor"),
            metadata=_section("metadata"),
        )
+ """ + path = Path(path) + path.parent.mkdir(parents=True, exist_ok=True) + with path.open("w", encoding="utf-8") as out: + if meta is not None: + out.write(json.dumps({"__meta__": dict(meta)}) + "\n") + for case in cases: + out.write(json.dumps(case.to_dict()) + "\n") + + +def load_package_cases(path: str | Path) -> list[PackageRelevanceCase]: + """Read :class:`PackageRelevanceCase`\\ s from JSONL (skips a ``__meta__`` header).""" + cases: list[PackageRelevanceCase] = [] + for line in Path(path).read_text(encoding="utf-8").splitlines(): + line = line.strip() + if not line: + continue + obj = json.loads(line) + if "__meta__" in obj: + continue + cases.append(PackageRelevanceCase.from_dict(obj)) + return cases + + +def read_package_meta(path: str | Path) -> dict[str, Any]: + """Return the ``__meta__`` header dict from a package-cases JSONL (``{}`` if absent). + + The header carries the per-theme probe text (``probes``) and the corpus + signature the labels were frozen against — everything :func:`compare_indexings` + needs that is not on a per-artifact case. + """ + for line in Path(path).read_text(encoding="utf-8").splitlines(): + line = line.strip() + if not line: + continue + obj = json.loads(line) + if "__meta__" in obj: + return dict(obj["__meta__"]) + return {} + return {} + + +def level_histogram( + cases: Sequence[PackageRelevanceCase], theme: str +) -> dict[str, int]: + """Count of artifacts at each level for ``theme`` (every level represented).""" + counts = Counter(c.level(theme) for c in cases) + return {level: counts.get(level, 0) for level in RELEVANCE_LEVELS} + + +def to_graded_qrels( + cases: Sequence[PackageRelevanceCase], + theme: str, + *, + probe: str | None = None, + gains: Mapping[str, float] = LEVEL_GAINS, + query_id: str | None = None, +) -> tuple[dict[str, str], dict[str, dict[str, float]]]: + """Build ``ef``'s **graded** ``(queries, qrels)`` for one ``theme``. 
def to_graded_qrels(
    cases: Sequence[PackageRelevanceCase],
    theme: str,
    *,
    probe: str | None = None,
    gains: Mapping[str, float] = LEVEL_GAINS,
    query_id: str | None = None,
) -> tuple[dict[str, str], dict[str, dict[str, float]]]:
    """Build graded ``(queries, qrels)`` for a single ``theme``.

    Exactly one query is produced.  Its id is ``query_id`` (falling back to
    ``"theme:<theme>"``) and its text is ``probe`` (falling back to the theme
    name).  ``qrels`` maps that id to ``{artifact_id: gain}`` for every case
    whose gain under ``gains`` is strictly positive; zero-gain cases are left
    out.
    """
    qid = query_id or f"theme:{theme}"
    judged: dict[str, float] = {}
    for case in cases:
        value = case.gain(theme, gains=gains)
        if value > 0:
            judged[case.artifact_id] = value
    return {qid: probe or theme}, {qid: judged}


@dataclass(frozen=True)
class NamedSets:
    """The two named diagnostic id sets for one theme.

    Attributes:
        theme: the theme the sets belong to.
        distractors: ids that are not relevant yet rank prominently; each one
            found in the top-``k`` is a false positive.
        hard_positives: ids that are relevant but hard to rank; recall over
            them is the gap a fix is expected to close.
    """

    theme: str
    distractors: tuple[str, ...] = ()
    hard_positives: tuple[str, ...] = ()

    def to_dict(self) -> dict:
        """Return a JSON-serializable dict (tuples become lists)."""
        exported: dict = {"theme": self.theme}
        exported["distractors"] = list(self.distractors)
        exported["hard_positives"] = list(self.hard_positives)
        return exported

    @classmethod
    def from_dict(cls, d: Mapping[str, Any]) -> "NamedSets":
        """Rebuild from :meth:`to_dict` output; absent/null lists become ``()``."""
        distractor_ids = d.get("distractors") or ()
        hard_ids = d.get("hard_positives") or ()
        return cls(
            theme=d["theme"],
            distractors=tuple(distractor_ids),
            hard_positives=tuple(hard_ids),
        )
def derive_named_sets(
    cases: Sequence[PackageRelevanceCase],
    theme: str,
    *,
    observed_floor: float,
    gains: Mapping[str, float] = LEVEL_GAINS,
) -> NamedSets:
    """Derive the :class:`NamedSets` of ``theme`` from the case labels.

    Distractors: a case with an explicit ``is_distractor[theme]`` flag follows
    that flag.  Otherwise it is a distractor when its gain under ``gains`` is
    not positive and it has an ``observed`` score for the theme that is at
    least ``observed_floor``.

    Hard positives: cases whose level is ``core`` or ``strong`` and whose
    ``thin_description`` flag is set.

    Both id collections are de-duplicated and sorted, so equal inputs always
    give an equal result.
    """

    def _counts_as_distractor(case: PackageRelevanceCase) -> bool:
        explicit = case.is_distractor.get(theme)
        if explicit is not None:
            # A cached flag wins over the observed-floor rule.
            return bool(explicit)
        is_positive = case.gain(theme, gains=gains) > 0
        score = case.observed.get(theme)
        return not is_positive and score is not None and score >= observed_floor

    distractor_ids: set[str] = set()
    hard_positive_ids: set[str] = set()
    for case in cases:
        if _counts_as_distractor(case):
            distractor_ids.add(case.artifact_id)
        if case.level(theme) in ("core", "strong") and case.thin_description:
            hard_positive_ids.add(case.artifact_id)

    return NamedSets(
        theme=theme,
        distractors=tuple(sorted(distractor_ids)),
        hard_positives=tuple(sorted(hard_positive_ids)),
    )
+ """ + ids = tuple(dict.fromkeys(distractor_ids)) + if not ids: + return 0.0 + top = set(ranking[:k]) + return sum(1 for d in ids if d in top) / len(ids) + + +def recall_on_hard_positives( + ranking: Sequence[str], hard_positive_ids: Iterable[str], *, k: int +) -> float: + """Fraction of named hard-positives in the top-``k`` (duplicate ids counted once).""" + ids = tuple(dict.fromkeys(hard_positive_ids)) + if not ids: + return 0.0 + top = set(ranking[:k]) + return sum(1 for h in ids if h in top) / len(ids) + + +def _probe_hits( + corpus: Any, + probe: str, + *, + mode: str, + k: int, + surfaces: Iterable[str] | None = None, + **search_kw: Any, +) -> list[SearchHit]: + """Run one theme probe and return its ranked hits (best first, per artifact).""" + corpus = _as_corpus(corpus) + return _search( + corpus, probe, k=k, mode=mode, surfaces=surfaces, per_artifact=True, **search_kw + ) + + +@dataclass(frozen=True) +class NamedSetReport: + """The outcome of :func:`evaluate_named_sets` for one theme. + + Attributes: + theme / k: the theme and the rank cutoff scored. + fp_rate: distractor false-positive rate at ``k`` (lower is better). + hard_positive_recall: recall over the hard-positive set at ``k`` (higher + is better). + distractors_seen: the distractor ids actually in the top-``k`` — so the + ``fp_rate`` is auditable back to specific packages. + hard_positives_missed: the hard-positive ids NOT in the top-``k``. + """ + + theme: str + k: int + fp_rate: float + hard_positive_recall: float + distractors_seen: tuple[str, ...] = () + hard_positives_missed: tuple[str, ...] 
@dataclass(frozen=True)
class NamedSetReport:
    """The outcome of :func:`evaluate_named_sets` for one theme.

    Attributes:
        theme: the theme that was scored.
        k: the rank cutoff used.
        fp_rate: distractor false-positive rate at ``k`` (lower is better).
        hard_positive_recall: recall over the hard-positive set at ``k``
            (higher is better).
        distractors_seen: the distractor ids found in the top-``k``.
        hard_positives_missed: the hard-positive ids not in the top-``k``.
    """

    theme: str
    k: int
    fp_rate: float
    hard_positive_recall: float
    distractors_seen: tuple[str, ...] = ()
    hard_positives_missed: tuple[str, ...] = ()

    def to_dict(self) -> dict:
        """Return a JSON-serializable dict (tuples become lists)."""
        return {
            "theme": self.theme,
            "k": self.k,
            "fp_rate": self.fp_rate,
            "hard_positive_recall": self.hard_positive_recall,
            "distractors_seen": list(self.distractors_seen),
            "hard_positives_missed": list(self.hard_positives_missed),
        }

    def __str__(self) -> str:
        return (
            f"[{self.theme}] fp_rate@{self.k}={self.fp_rate:.3f} "
            f"(seen {list(self.distractors_seen)}) "
            f"hard_pos_recall@{self.k}={self.hard_positive_recall:.3f} "
            f"(missed {list(self.hard_positives_missed)})"
        )


def evaluate_named_sets(
    corpus: Any,
    named_sets: NamedSets,
    theme: str | None = None,
    *,
    probe: str,
    mode: str = DFLT_MODE,
    k: int = 10,
    surfaces: Iterable[str] | None = None,
    **search_kw: Any,
) -> NamedSetReport:
    """Score one theme's named sets with a single probe.

    The probe is run once through ``_probe_hits`` with cutoff ``k``; the hit
    order is the ranking.  The report holds the distractor false-positive rate
    and the hard-positive recall at ``k`` together with the ids behind each
    number.

    Args:
        corpus: whatever ``_probe_hits`` accepts as a corpus.
        named_sets: the distractor / hard-positive ids to score.
        theme: label for the report; defaults to ``named_sets.theme``.
        probe: the query text to search with.
        mode: search mode forwarded to ``_probe_hits``.
        k: rank cutoff.
        surfaces: optional surface filter forwarded to ``_probe_hits``.
        **search_kw: extra keyword arguments forwarded to ``_probe_hits``.
    """
    theme = theme or named_sets.theme
    hits = _probe_hits(corpus, probe, mode=mode, k=k, surfaces=surfaces, **search_kw)
    ranking = [hit.artifact_id for hit in hits]
    top = set(ranking[:k])
    # The rate functions count duplicate ids once; de-duplicate here as well so
    # the audit lists always agree with the rates they explain.
    distractors = tuple(dict.fromkeys(named_sets.distractors))
    hard_positives = tuple(dict.fromkeys(named_sets.hard_positives))
    return NamedSetReport(
        theme=theme,
        k=k,
        fp_rate=fp_rate_on_distractors(ranking, distractors, k=k),
        hard_positive_recall=recall_on_hard_positives(ranking, hard_positives, k=k),
        distractors_seen=tuple(d for d in distractors if d in top),
        hard_positives_missed=tuple(h for h in hard_positives if h not in top),
    )
+ + For a ``hard_positive``, worse = its rank dropped (grew larger) by more + than ``threshold`` positions. For a ``distractor``, worse = its rank rose + (grew smaller, more prominent) by more than ``threshold``. A package + absent from a ranking is treated as rank ``inf`` (worst), so a vanished + hard-positive and a newly-appearing distractor both register. Returns one + dict per regressing ``(theme, id, label)``. + """ + base = baseline or self.baseline + inf = 10**9 + out: list[dict] = [] + for theme, ids in self.deltas.items(): + for aid, info in ids.items(): + role = info.get("role") + by_label = info.get("by_label", {}) + if role is None or base not in by_label: + continue + base_rank = by_label[base].get("rank") or inf + for label, cell in by_label.items(): + if label == base: + continue + cand_rank = cell.get("rank") or inf + if role == "hard_positive": + worse = (cand_rank - base_rank) > threshold + else: # distractor: smaller rank == more prominent == worse + worse = (base_rank - cand_rank) > threshold + if worse: + out.append( + { + "theme": theme, + "artifact_id": aid, + "role": role, + "label": label, + # rank 0/absent -> None: ranks are 1-based, so 0 + # would misleadingly read as "top of list". 
+ "baseline_rank": by_label[base].get("rank") or None, + "candidate_rank": cell.get("rank") or None, + } + ) + return out + + def __str__(self) -> str: + lines = [f"ComparisonReport(k={self.k}, baseline={self.baseline!r})"] + for label in self.labels: + for theme in self.themes: + row = self.metrics[label][theme] + parts = [f"ndcg@{self.k}={row['ndcg']:.3f}"] + if "fp_rate" in row: + parts.append(f"fp_rate={row['fp_rate']:.3f}") + if "hard_positive_recall" in row: + parts.append(f"hp_recall={row['hard_positive_recall']:.3f}") + lines.append(f" {label:<12} {theme:<12} " + " ".join(parts)) + return "\n".join(lines) + + +def compare_indexings( + corpora: Mapping[str, Any], + cases: Sequence[PackageRelevanceCase], + *, + themes: Sequence[str], + probes: Mapping[str, str], + k: int = 20, + named_sets: Mapping[str, NamedSets] | None = None, + mode: str = DFLT_MODE, + rank_depth: int = 1000, + gains: Mapping[str, float] = LEVEL_GAINS, + surfaces: Iterable[str] | None = None, + **search_kw: Any, +) -> ComparisonReport: + """A/B (or N-way) regression gate over indexing / embedder configurations. + + ``corpora`` maps a label -> a built corpus (an ``ef`` instruction-tuned-embedder + corpus or an ``ir`` deps-as-text corpus is just another entry — the harness is + embedder-agnostic). For each ``(label, theme)`` it computes graded nDCG@k (via + :func:`to_graded_qrels` + ``ef.evaluation.ndcg_at_k``), the named-set FP-rate / + hard-positive recall@k (when ``named_sets`` is given), and the rank+score of + every named FP/FN id — so a change's effect is quantified *per package*. + ``probes`` supplies the per-theme query text (normally loaded from the JSONL + ``__meta__`` via :func:`read_package_meta`). The first label is the baseline + that :meth:`ComparisonReport.regressions` compares against. Each ranking is + fetched once to depth ``rank_depth`` (deep enough that even a buried named id + gets a real rank) and sliced at ``k`` for the ``@k`` metrics. 
def compare_indexings(
    corpora: Mapping[str, Any],
    cases: Sequence[PackageRelevanceCase],
    *,
    themes: Sequence[str],
    probes: Mapping[str, str],
    k: int = 20,
    named_sets: Mapping[str, NamedSets] | None = None,
    mode: str = DFLT_MODE,
    rank_depth: int = 1000,
    gains: Mapping[str, float] = LEVEL_GAINS,
    surfaces: Iterable[str] | None = None,
    **search_kw: Any,
) -> ComparisonReport:
    """Compare several corpora (indexing / embedder configurations) on the cases.

    For every ``(label, theme)`` the theme probe is run once to depth
    ``rank_depth``.  From that ranking the function computes graded nDCG at
    ``k`` (``ef.evaluation.ndcg_at_k`` over the positive-gain cases) and, when
    ``named_sets`` has an entry for the theme, the distractor FP-rate and
    hard-positive recall at ``k`` plus the rank and score of every named id.

    Args:
        corpora: ``{label: corpus}``; the first label becomes the baseline.
        cases: the graded cases to score against.
        themes: the themes to score.
        probes: ``{theme: query text}``; must cover every theme.
        k: rank cutoff for the ``@k`` metrics.
        named_sets: optional ``{theme: NamedSets}``.
        mode: search mode forwarded to ``_probe_hits``.
        rank_depth: how many hits to fetch per probe; must be ``>= k``.
        gains: level -> gain mapping used for the graded qrels.
        surfaces: optional surface filter forwarded to ``_probe_hits``.
        **search_kw: extra keyword arguments forwarded to ``_probe_hits``.

    Returns:
        A :class:`ComparisonReport`.  A named id missing from a ranking is
        recorded with rank ``0`` and score ``None``.

    Raises:
        ValueError: if ``corpora`` is empty, ``rank_depth < k``, or a theme has
            no probe text.
    """
    from ef.evaluation import ndcg_at_k

    labels = tuple(corpora)
    if not labels:
        raise ValueError("compare_indexings needs at least one corpus")
    if rank_depth < k:
        raise ValueError(f"rank_depth ({rank_depth}) must be >= k ({k})")
    # Materialize once: ``themes`` is walked several times below.
    themes = tuple(themes)
    missing_probes = [theme for theme in themes if theme not in probes]
    if missing_probes:
        raise ValueError(f"no probe text for theme(s): {missing_probes}")

    # Everything that depends only on (cases, theme) is label-invariant, so it
    # is built once here instead of once per corpus label.
    relevant_by_theme: dict[str, dict[str, float]] = {}
    named_by_theme: dict[str, NamedSets | None] = {}
    roles_by_theme: dict[str, dict[str, str]] = {}
    for theme in themes:
        relevant: dict[str, float] = {}
        for case in cases:
            value = case.gain(theme, gains=gains)
            if value > 0:
                relevant[case.artifact_id] = value
        relevant_by_theme[theme] = relevant
        ns = named_sets.get(theme) if named_sets else None
        named_by_theme[theme] = ns
        if ns is not None:
            # An id listed in both sets is reported as a hard positive.
            roles_by_theme[theme] = {
                **{d: "distractor" for d in ns.distractors},
                **{h: "hard_positive" for h in ns.hard_positives},
            }

    metrics: dict[str, dict[str, dict[str, float]]] = {}
    deltas: dict[str, dict[str, dict]] = {theme: {} for theme in themes}
    for label in labels:
        corpus = _as_corpus(corpora[label])
        metrics[label] = {}
        for theme in themes:
            hits = _probe_hits(
                corpus, probes[theme], mode=mode, k=rank_depth, surfaces=surfaces,
                **search_kw,
            )
            ranking = [hit.artifact_id for hit in hits]
            score_by = {hit.artifact_id: float(hit.score) for hit in hits}
            rank_by = {aid: pos + 1 for pos, aid in enumerate(ranking)}
            # The scorer receives its own copy so no state is shared between
            # labels, exactly as when the qrels were rebuilt on every pass.
            row: dict[str, float] = {
                "ndcg": ndcg_at_k(ranking, dict(relevant_by_theme[theme]), k)
            }
            ns = named_by_theme[theme]
            if ns is not None:
                row["fp_rate"] = fp_rate_on_distractors(ranking, ns.distractors, k=k)
                row["hard_positive_recall"] = recall_on_hard_positives(
                    ranking, ns.hard_positives, k=k
                )
                for aid, role in roles_by_theme[theme].items():
                    entry = deltas[theme].setdefault(
                        aid, {"role": role, "by_label": {}}
                    )
                    entry["by_label"][label] = {
                        "rank": rank_by.get(aid, 0),
                        "score": score_by.get(aid),
                    }
            metrics[label][theme] = row
    return ComparisonReport(
        k=k,
        themes=themes,
        labels=labels,
        baseline=labels[0],
        metrics=metrics,
        deltas=deltas,
    )
b/tests/fixtures/package_relevance_fixture.jsonl @@ -0,0 +1,13 @@ +{"__meta__": {"description": "Public-name-only fixture for the graded package-relevance harness (issue #66). No private package names. Mirrors the real failure shapes: thin-description hard positives + modality/pipeline distractors.", "probes": {"embeddings": "embedding semantic vector text similarity search", "graphs": "graph network node edge directed"}, "corpus_signature": "fixture-not-pinned"}} +{"artifact_id": "sentence-transformers", "labels": {"embeddings": "core"}, "evidence": {"embeddings": "sentence/text embedding models"}, "observed": {"embeddings": 0.60}} +{"artifact_id": "chromadb", "labels": {"embeddings": "core"}, "evidence": {"embeddings": "vector database for embeddings"}, "observed": {"embeddings": 0.55}} +{"artifact_id": "transformers", "labels": {"embeddings": "strong"}, "evidence": {"embeddings": "transformer language models, text embeddings"}, "observed": {"embeddings": 0.50}} +{"artifact_id": "torch", "labels": {"embeddings": "uses-tools"}, "evidence": {"embeddings": "tensor backend other libs embed with"}, "observed": {"embeddings": 0.45}} +{"artifact_id": "sklearn", "labels": {"embeddings": "uses-tools"}, "evidence": {"embeddings": "feature vectorizers used as a means"}, "observed": {"embeddings": 0.42}} +{"artifact_id": "fasttext", "labels": {"embeddings": "core"}, "evidence": {"embeddings": "word embedding library"}, "observed": {"embeddings": 0.05}, "thin_description": true} +{"artifact_id": "librosa", "labels": {"embeddings": "none"}, "evidence": {"embeddings": "audio/signal DSP - a modality false positive"}, "observed": {"embeddings": 0.46}} +{"artifact_id": "networkx", "labels": {"graphs": "core"}, "evidence": {"graphs": "graph data structures and algorithms"}, "observed": {"graphs": 0.60}} +{"artifact_id": "igraph", "labels": {"graphs": "core"}, "evidence": {"graphs": "graph/network analysis"}, "observed": {"graphs": 0.55}} +{"artifact_id": "graphviz", "labels": 
{"graphs": "strong"}, "evidence": {"graphs": "graph rendering"}, "observed": {"graphs": 0.50}} +{"artifact_id": "kroki", "labels": {"graphs": "strong"}, "evidence": {"graphs": "renders graphviz/mermaid digraphs"}, "observed": {"graphs": 0.10}, "thin_description": true} +{"artifact_id": "airflow", "labels": {"graphs": "none"}, "evidence": {"graphs": "pipeline/DAG scheduler - a flow false positive"}, "observed": {"graphs": 0.44}} diff --git a/tests/test_package_relevance.py b/tests/test_package_relevance.py new file mode 100644 index 0000000..d75ef63 --- /dev/null +++ b/tests/test_package_relevance.py @@ -0,0 +1,316 @@ +"""Tests for the graded package-relevance harness (``ir.eval``, issue #66). + +Hermetic: the public-name-only fixture +(``tests/fixtures/package_relevance_fixture.jsonl``) plus the light, numpy-only +embedder — no private package names, no network, no model download. Covers the +schema round-trip, graded qrels, the named-set metric functions, the +deterministic ``derive_named_sets`` derivation, the ``compare_indexings`` +A/B gate, and the ``regressions`` gate logic. +""" + +from pathlib import Path + +import pytest + +import ir +from ir import eval as ev +from ir.store import CorpusStore + +FIXTURE = Path(__file__).parent / "fixtures" / "package_relevance_fixture.jsonl" + +# A tiny disjoint-vocab corpus whose keys match the fixture artifact ids, so the +# light (hashing) embedder ranks each theme's probe sensibly and deterministically. 
DOCS = {
    "sentence-transformers": "sentence embedding semantic vector text similarity model",
    "chromadb": "vector embedding similarity search store database",
    "transformers": "transformer language model text embedding nlp tokenizer",
    "torch": "tensor deep learning autograd gpu training backend",
    "sklearn": "feature vectorizer machine learning classifier regression",
    "fasttext": "fasttext word embedding subword vectors",
    "librosa": "audio sound waveform spectrogram signal music",
    "networkx": "graph network node edge directed algorithms",
    "igraph": "graph vertex edge community network analysis",
    "graphviz": "graph visualization dot digraph node edge layout",
    "kroki": "diagram digraph mermaid rendering node edge",
    "airflow": "workflow pipeline scheduler tasks orchestration cron",
}
THEMES = ("embeddings", "graphs")


def _corpus():
    """Build an in-memory corpus over ``DOCS`` with the ``"light"`` embedder."""
    source = ir.CorpusSource.from_mapping(
        DOCS, name="pkgfix", strategy=ir.WholeText()
    )
    return ir.build(source, store=CorpusStore.memory(), embedder="light")


def _cases():
    """Load the graded cases from the fixture JSONL."""
    return ev.load_package_cases(FIXTURE)


def _probes():
    """Return the per-theme probe text stored in the fixture header."""
    meta = ev.read_package_meta(FIXTURE)
    return meta["probes"]


# --------------------------------------------------------------------------- #
# Schema + (de)serialization
# --------------------------------------------------------------------------- #


def test_package_case_roundtrip(tmp_path):
    import json

    rich = ev.PackageRelevanceCase(
        "a",
        labels={"embeddings": "core", "graphs": "none"},
        evidence={"embeddings": "why"},
        observed={"embeddings": 0.5},
        thin_description=True,
        metadata={"d": 1},
    )
    minimal = ev.PackageRelevanceCase("b", labels={"graphs": "strong"})
    originals = [rich, minimal]
    meta = {"probes": {"embeddings": "x"}}

    target = tmp_path / "cases.jsonl"
    ev.save_package_cases(originals, target, meta=meta)

    # Frozen dataclasses compare by value, so list equality is a full check.
    assert ev.load_package_cases(target) == originals
    first_line = target.read_text(encoding="utf-8").splitlines()[0]
    assert json.loads(first_line) == {"__meta__": {"probes": {"embeddings": "x"}}}
    assert ev.read_package_meta(target) == {"probes": {"embeddings": "x"}}


def test_from_dict_rejects_unknown_level():
    payload = {"artifact_id": "a", "labels": {"embeddings": "kinda"}}
    with pytest.raises(ValueError, match="unknown relevance level"):
        ev.PackageRelevanceCase.from_dict(payload)


def test_fixture_loads_and_is_public_only():
    loaded = _cases()
    assert len(loaded) == 12
    # The fixture ids must be exactly the public names listed in DOCS.
    fixture_ids = {case.artifact_id for case in loaded}
    assert fixture_ids == set(DOCS)


def test_level_and_gain():
    labelled = ev.PackageRelevanceCase("x", labels={"embeddings": "core"})
    assert labelled.level("embeddings") == "core"
    assert labelled.level("graphs") == "none"  # no label -> "none"
    assert labelled.gain("embeddings") == 3.0
    assert labelled.gain("graphs") == 0.0
# --------------------------------------------------------------------------- #
# Graded qrels
# --------------------------------------------------------------------------- #


def test_to_graded_qrels_uses_level_gains():
    from collections import Counter

    queries, qrels = ev.to_graded_qrels(_cases(), "graphs", probe="graph network")
    qid = "theme:graphs"
    assert queries == {qid: "graph network"}

    grades = qrels[qid]
    # core -> 3.0 (networkx, igraph); strong -> 2.0 (graphviz, kroki).
    expected = {"networkx": 3.0, "igraph": 3.0, "graphviz": 2.0, "kroki": 2.0}
    for artifact_id in ("networkx", "igraph", "graphviz", "kroki"):
        assert grades[artifact_id] == expected[artifact_id]
    # A "none" label is not judged positive, so airflow is left out.
    assert "airflow" not in grades
    # The grade histogram reproduces the known group sizes.
    assert Counter(grades.values()) == {3.0: 2, 2.0: 2}


def test_level_histogram_covers_all_levels():
    histogram = ev.level_histogram(_cases(), "embeddings")
    assert set(histogram) == set(ev.RELEVANCE_LEVELS)
    assert histogram["core"] == 3  # sentence-transformers, chromadb, fasttext
    assert histogram["strong"] == 1
    assert histogram["uses-tools"] == 2


# --------------------------------------------------------------------------- #
# Named-set metrics (pure functions) + derivation
# --------------------------------------------------------------------------- #


def test_fp_rate_and_recall_pure():
    order = ["a", "b", "c", "d", "e"]
    # One of the two distractors sits in the top 3.
    assert ev.fp_rate_on_distractors(order, ["b", "z"], k=3) == 0.5
    # Both hard positives are in the top 5, only one of them in the top 2.
    assert ev.recall_on_hard_positives(order, ["a", "d"], k=5) == 1.0
    assert ev.recall_on_hard_positives(order, ["a", "d"], k=2) == 0.5
    # An empty named set gives 0.0 instead of dividing by zero.
    assert ev.fp_rate_on_distractors(order, [], k=3) == 0.0
    assert ev.recall_on_hard_positives(order, [], k=3) == 0.0


def test_derive_named_sets_is_deterministic():
    loaded = _cases()

    embeddings = ev.derive_named_sets(loaded, "embeddings", observed_floor=0.4)
    # librosa is "none" with observed 0.46 >= 0.4; uses-tools cases are positive.
    assert embeddings.distractors == ("librosa",)
    # fasttext is core and flagged thin_description.
    assert embeddings.hard_positives == ("fasttext",)

    graphs = ev.derive_named_sets(loaded, "graphs", observed_floor=0.4)
    assert graphs.distractors == ("airflow",)
    assert graphs.hard_positives == ("kroki",)
    # The derived sets survive a dict round-trip unchanged.
    assert ev.NamedSets.from_dict(graphs.to_dict()) == graphs


# --------------------------------------------------------------------------- #
# evaluate_named_sets + compare_indexings (real tiny corpus, light embedder)
# --------------------------------------------------------------------------- #


def test_evaluate_named_sets_is_auditable():
    named = ev.derive_named_sets(_cases(), "graphs", observed_floor=0.4)
    outcome = ev.evaluate_named_sets(
        _corpus(), named, "graphs", probe=_probes()["graphs"], mode="dense", k=5
    )
    assert 0.0 <= outcome.fp_rate <= 1.0
    assert 0.0 <= outcome.hard_positive_recall <= 1.0
    # Every reported id must come from the named sets that were scored.
    assert set(outcome.distractors_seen) <= set(named.distractors)
    assert set(outcome.hard_positives_missed) <= set(named.hard_positives)


def test_compare_indexings_self_has_no_regression():
    import json

    shared_corpus = _corpus()
    loaded = _cases()
    named = {}
    for theme in THEMES:
        named[theme] = ev.derive_named_sets(loaded, theme, observed_floor=0.4)

    report = ev.compare_indexings(
        {"baseline": shared_corpus, "candidate": shared_corpus},
        loaded,
        themes=THEMES,
        probes=_probes(),
        named_sets=named,
        mode="dense",
        k=5,
    )

    # The same corpus on both sides: no named id can have moved.
    assert report.regressions() == []
    # Each (label, theme) row has a graded nDCG in [0, 1] and both named-set rates.
    for label in ("baseline", "candidate"):
        for theme in THEMES:
            row = report.metrics[label][theme]
            assert 0.0 <= row["ndcg"] <= 1.0
            assert "fp_rate" in row
            assert "hard_positive_recall" in row
    # to_dict must survive a JSON round-trip.
    roundtripped = json.loads(json.dumps(report.to_dict()))
    assert roundtripped["baseline"] == "baseline"
def test_regressions_flags_dropped_positive_and_risen_distractor():
    # Hand-built report: one hard positive dropped (rank 3 -> 30), one
    # distractor rose (rank 40 -> 2), one id did not move at all.
    def _cell(rank, score):
        return {"rank": rank, "score": score}

    shared_metrics = {
        "baseline": {"graphs": {"ndcg": 0.5}},
        "candidate": {"graphs": {"ndcg": 0.4}},
    }
    report = ev.ComparisonReport(
        k=20,
        themes=("graphs",),
        labels=("baseline", "candidate"),
        baseline="baseline",
        metrics=shared_metrics,
        deltas={
            "graphs": {
                "kroki": {
                    "role": "hard_positive",
                    "by_label": {
                        "baseline": _cell(3, 0.4),
                        "candidate": _cell(30, 0.1),
                    },
                },
                "airflow": {
                    "role": "distractor",
                    "by_label": {
                        "baseline": _cell(40, 0.1),
                        "candidate": _cell(2, 0.5),
                    },
                },
                "networkx": {
                    "role": "hard_positive",
                    "by_label": {
                        "baseline": _cell(1, 0.9),
                        "candidate": _cell(1, 0.9),
                    },
                },
            }
        },
    )

    flagged = {(reg["artifact_id"], reg["role"]) for reg in report.regressions()}
    assert ("kroki", "hard_positive") in flagged
    assert ("airflow", "distractor") in flagged
    assert ("networkx", "hard_positive") not in flagged
    # A generous tolerance suppresses both moves.
    assert report.regressions(threshold=100) == []

    # Ranks are 1-based, so an absent/0 rank is reported as None, not 0.
    vanished = ev.ComparisonReport(
        k=20,
        themes=("graphs",),
        labels=("baseline", "candidate"),
        baseline="baseline",
        metrics=shared_metrics,
        deltas={
            "graphs": {
                "kroki": {
                    "role": "hard_positive",
                    "by_label": {
                        "baseline": _cell(3, 0.4),
                        "candidate": _cell(0, None),  # vanished
                    },
                }
            }
        },
    )
    assert vanished.regressions()[0]["candidate_rank"] is None


# --------------------------------------------------------------------------- #
# Hardening (review follow-ups): defensive defaults & input guards
# --------------------------------------------------------------------------- #


def test_gain_with_partial_gains_defaults_to_zero():
    unlabeled = ev.PackageRelevanceCase("x", labels={"embeddings": "none"})
    # A gains mapping without "none" must give 0.0, not a KeyError.
    assert unlabeled.gain("embeddings", gains={"core": 3.0}) == 0.0


def test_derive_named_sets_honors_cached_is_distractor():
    # Cached True wins even though no observed score is present.
    forced_in = ev.PackageRelevanceCase(
        "x",
        labels={"embeddings": "none"},
        is_distractor={"embeddings": True},
    )
    # Cached False suppresses what the observed-floor rule would flag.
    forced_out = ev.PackageRelevanceCase(
        "y",
        labels={"embeddings": "none"},
        observed={"embeddings": 0.9},
        is_distractor={"embeddings": False},
    )
    derived = ev.derive_named_sets(
        [forced_in, forced_out], "embeddings", observed_floor=0.4
    )
    assert derived.distractors == ("x",)


def test_named_set_rates_dedup_ids():
    order = ["a", "b", "c"]
    # A repeated id counts once: the rate is 1/1, never 2/2 or 1/2.
    assert ev.fp_rate_on_distractors(order, ["b", "b"], k=3) == 1.0
    assert ev.recall_on_hard_positives(order, ["z", "z"], k=3) == 0.0


def test_compare_indexings_guards_bad_args():
    corpora = {"b": _corpus()}
    loaded = _cases()
    with pytest.raises(ValueError, match="rank_depth"):
        ev.compare_indexings(
            corpora, loaded, themes=("graphs",), probes=_probes(), k=20, rank_depth=5
        )
    with pytest.raises(ValueError, match="no probe text"):
        ev.compare_indexings(
            corpora,
            loaded,
            themes=("graphs", "missing"),
            probes={"graphs": "g"},
            k=5,
            rank_depth=50,
        )


def test_evaluate_named_sets_theme_defaults_to_named_sets_theme():
    named = ev.derive_named_sets(_cases(), "graphs", observed_floor=0.4)
    outcome = ev.evaluate_named_sets(
        _corpus(), named, probe=_probes()["graphs"], mode="dense", k=5
    )
    assert outcome.theme == "graphs"