From aac3d832a323fe557c178534b54372f784fd5974 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Victor=20Gonz=C3=A1lez-Pacheco?= Date: Sun, 24 May 2026 19:17:19 +0200 Subject: [PATCH 01/11] [#34] add MediaPhotoDescribed variant and discriminator updates Introduce a fifth media variant carrying the vision-LLM output: is_decorative + description + description_lang + description_version + described_at, on top of the downloaded-state fields. The discriminated union grows from four to five `kind`s; the path-traversal validator carries over verbatim, since the on-disk bytes are inherited from the prior downloaded state. Update every isinstance cascade that walked the union (media.py eligibility, generate.py rendering + mirror, diff.py counting) to handle the new variant explicitly so `assert_never` keeps the exhaustiveness check honest. The diff report grows a `described` column and a `delta_described` field; the count is exposed through both the text and JSON renderers. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/xbrain/diff.py | 30 ++++++++--- src/xbrain/generate.py | 21 +++++--- src/xbrain/media.py | 43 ++++++++++++--- src/xbrain/models.py | 81 ++++++++++++++++++++++++++-- tests/test_models.py | 116 +++++++++++++++++++++++++++++++++++++++++ 5 files changed, 264 insertions(+), 27 deletions(-) diff --git a/src/xbrain/diff.py b/src/xbrain/diff.py index f41715d..3774097 100644 --- a/src/xbrain/diff.py +++ b/src/xbrain/diff.py @@ -32,6 +32,7 @@ from xbrain.models import ( Item, + MediaPhotoDescribed, MediaPhotoDownloaded, MediaPhotoFailed, MediaPhotoPending, @@ -132,13 +133,15 @@ class VocabDiff(BaseModel): class MediaStateCounts(BaseModel): """Counts of media variants on one side of a diff. - Mirrors the four-variant union: downloaded / pending / failed - / video_pending. Counts are global across all items in the store — - per-item resolution lives on the items themselves (already - diff-able via the items round-trip). + Mirrors the five-variant union: downloaded / described / pending / + failed / video_pending. Counts are global across all items in the + store — per-item resolution lives on the items themselves (already + diff-able via the items round-trip). `described` is the terminal + state for content-bearing photos after a vision-describe pass. """ downloaded: int = 0 + described: int = 0 pending: int = 0 failed: int = 0 video_pending: int = 0 @@ -149,14 +152,18 @@ class MediaDiff(BaseModel): `delta_downloaded = b.downloaded - a.downloaded` answers the operator's first question after `xbrain media`: "how many new photos - landed?" Same for the other three variants. Negative deltas are - legal — `xbrain media --force` can move a previously-downloaded - photo back to `failed` if the URL turned 404 in the meantime. + landed?" `delta_described` answers the analogous question after + `xbrain describe`: "how many photos transitioned from downloaded to + described?" Negative deltas are legal — `--force` runs can move + photos backwards along the pipeline (e.g. a `describe --force` + behind a CDN change drops the previous description; a + `media --force` re-download collapses described back to downloaded). """ a: MediaStateCounts = Field(default_factory=MediaStateCounts) b: MediaStateCounts = Field(default_factory=MediaStateCounts) delta_downloaded: int = 0 + delta_described: int = 0 delta_pending: int = 0 delta_failed: int = 0 delta_video_pending: int = 0 @@ -176,6 +183,7 @@ class DiffSummary(BaseModel): topic_pages_a: int topic_pages_b: int media_delta_downloaded: int = 0 + media_delta_described: int = 0 media_delta_failed: int = 0 @@ -271,6 +279,7 @@ def _load_or_explain(path: Path, loader): topic_pages_a=len(pages_a), topic_pages_b=len(pages_b), media_delta_downloaded=media_diff.delta_downloaded, + media_delta_described=media_diff.delta_described, media_delta_failed=media_diff.delta_failed, ) return DiffReport( @@ -400,6 +409,7 @@ def _compute_media_diff( a=counts_a, b=counts_b, delta_downloaded=counts_b.downloaded - counts_a.downloaded, + delta_described=counts_b.described - counts_a.described, delta_pending=counts_b.pending - counts_a.pending, delta_failed=counts_b.failed - counts_a.failed, delta_video_pending=counts_b.video_pending - counts_a.video_pending, @@ -407,12 +417,14 @@ def _compute_media_diff( def _count_media_variants(items: dict[str, Item]) -> MediaStateCounts: - """Tally the four media variants across every item in the store.""" + """Tally the five media variants across every item in the store.""" counts = MediaStateCounts() for item in items.values(): for entry in item.media: if isinstance(entry, MediaPhotoDownloaded): counts.downloaded += 1 + elif isinstance(entry, MediaPhotoDescribed): + counts.described += 1 elif isinstance(entry, MediaPhotoPending): counts.pending += 1 elif isinstance(entry, MediaPhotoFailed): @@ -528,6 +540,8 @@ def _format_media_block(media: MediaDiff) -> list[str]: return [ f" downloaded: A={media.a.downloaded:>5} B={media.b.downloaded:>5} " f"({_format_delta(media.delta_downloaded)})", + f" described: A={media.a.described:>5} B={media.b.described:>5} " + f"({_format_delta(media.delta_described)})", f" pending: A={media.a.pending:>5} B={media.b.pending:>5} " f"({_format_delta(media.delta_pending)})", f" failed: A={media.a.failed:>5} B={media.b.failed:>5} " diff --git a/src/xbrain/generate.py b/src/xbrain/generate.py index 92bf149..83d6574 100644 --- a/src/xbrain/generate.py +++ b/src/xbrain/generate.py @@ -16,6 +16,7 @@ ContentSourceSuccess, FailureReason, Item, + MediaPhotoDescribed, MediaPhotoDownloaded, MediaPhotoFailed, MediaPhotoPending, @@ -196,10 +197,16 @@ def _render_media_lines(item: Item) -> list[str]: """One line per `Item.media` entry, ready to splice into the Tweet section. Variant handling: - - `MediaPhotoDownloaded` → Obsidian embed `![[_media//.]]`. - The vault is self-contained: `generate()` mirrors the file from - `data/media/` into `/_media/` before rendering, so the - embed resolves with no user configuration. + - `MediaPhotoDownloaded` / `MediaPhotoDescribed` → Obsidian embed + `![[_media//.]]`. The vault is self-contained: + `generate()` mirrors the file from `data/media/` into + `/_media/` before rendering, so the embed resolves + with no user configuration. The described variant inherits the + same on-disk file — the description is consumed by the LLM + prompts in `executors/api.py` / `topic_synth.py`, NOT shown as + alt-text in this phase. Decorative photos are still embedded; + the `is_decorative` flag only filters them out of the LLM prompts, + never out of the visual rendering. - `MediaPhotoFailed` → one-line ⚠ warning carrying the failure reason and the original URL — visible evidence, not a silent drop. - `MediaPhotoPending` → silent. Not an error, just "the next @@ -213,7 +220,7 @@ def _render_media_lines(item: Item) -> list[str]: """ lines: list[str] = [] for entry in item.media: - if isinstance(entry, MediaPhotoDownloaded): + if isinstance(entry, (MediaPhotoDownloaded, MediaPhotoDescribed)): lines.append(f"![[{_VAULT_MEDIA_SUBDIR}/{entry.local_path}]]") elif isinstance(entry, MediaPhotoFailed): reason = _FAILURE_ES_MEDIA.get(entry.failure_reason, entry.failure_reason) @@ -259,7 +266,9 @@ def _mirror_item_media(item: Item, media_root: Path, vault_media_dir: Path) -> N """ vault_media_dir.mkdir(parents=True, exist_ok=True) for entry in item.media: - if not isinstance(entry, MediaPhotoDownloaded): + # The described variant inherits the on-disk bytes from the + # prior downloaded state — both shapes hit the same mirror path. + if not isinstance(entry, (MediaPhotoDownloaded, MediaPhotoDescribed)): continue source = media_root / entry.local_path destination = vault_media_dir / entry.local_path diff --git a/src/xbrain/media.py b/src/xbrain/media.py index 67aaa57..c52322e 100644 --- a/src/xbrain/media.py +++ b/src/xbrain/media.py @@ -35,6 +35,7 @@ Item, MediaEntry, MediaFailureReason, + MediaPhotoDescribed, MediaPhotoDownloaded, MediaPhotoFailed, MediaPhotoPending, @@ -210,12 +211,21 @@ def download_all( def _is_eligible(entry: MediaEntry, *, force: bool) -> bool: - """Decide whether `download_all` should attempt this entry on THIS run.""" + """Decide whether `download_all` should attempt this entry on THIS run. + + The described variant inherits the downloaded contract: a re-download + only happens under `--force`. A `--force` re-download drops the + description (the entry collapses back to `MediaPhotoDownloaded`) — + `xbrain describe` is the path that re-adds it. Forcing the bytes + without re-describing is the rare case (e.g. the X CDN replaced the + asset); the operator is expected to follow with `xbrain describe + --force` if the new bytes warrant it. + """ if isinstance(entry, MediaVideoPending): return False if isinstance(entry, MediaPhotoPending): return True - if isinstance(entry, MediaPhotoDownloaded): + if isinstance(entry, (MediaPhotoDownloaded, MediaPhotoDescribed)): return force if isinstance(entry, MediaPhotoFailed): if force: @@ -243,15 +253,24 @@ def _iter_eligible_attempts( limit: int | None, force: bool, report: MediaReport, -) -> Iterator[tuple[str, Item, int, MediaPhotoPending | MediaPhotoFailed | MediaPhotoDownloaded]]: +) -> Iterator[ + tuple[ + str, + Item, + int, + MediaPhotoPending | MediaPhotoFailed | MediaPhotoDownloaded | MediaPhotoDescribed, + ] +]: """Yield each (item_id, item, index, entry) pair eligible for download. Encapsulates the empty-media skip + per-entry eligibility cascade + global limit countdown that `download_all` would otherwise interleave with the download orchestration. Side effects on `report`: bumps `items_processed` once per item that has media, and - `photos_skipped_already_downloaded` once per Downloaded entry passed - over (without `--force`). Stops yielding once `limit` is exhausted. + `photos_skipped_already_downloaded` once per Downloaded / Described + entry passed over (without `--force`) — both share the + "bytes-already-on-disk" semantics from the downloader's perspective. + Stops yielding once `limit` is exhausted. """ remaining = limit for item_id, item in items.items(): @@ -262,11 +281,19 @@ def _iter_eligible_attempts( if remaining is not None and remaining <= 0: return if not _is_eligible(entry, force=force): - if isinstance(entry, MediaPhotoDownloaded): + if isinstance(entry, (MediaPhotoDownloaded, MediaPhotoDescribed)): report.photos_skipped_already_downloaded += 1 continue # `_is_eligible` already excluded `MediaVideoPending`; narrow for mypy. - assert isinstance(entry, (MediaPhotoPending, MediaPhotoFailed, MediaPhotoDownloaded)) + assert isinstance( + entry, + ( + MediaPhotoPending, + MediaPhotoFailed, + MediaPhotoDownloaded, + MediaPhotoDescribed, + ), + ) if remaining is not None: remaining -= 1 yield item_id, item, index, entry @@ -305,7 +332,7 @@ def _record_outcome( def _download_one( - entry: MediaPhotoPending | MediaPhotoFailed | MediaPhotoDownloaded, + entry: MediaPhotoPending | MediaPhotoFailed | MediaPhotoDownloaded | MediaPhotoDescribed, *, item_id: str, index: int, diff --git a/src/xbrain/models.py b/src/xbrain/models.py index 13296ce..b018b1e 100644 --- a/src/xbrain/models.py +++ b/src/xbrain/models.py @@ -156,6 +156,62 @@ class MediaPhotoFailed(_MediaPhotoBase): last_attempt_at: datetime +class MediaPhotoDescribed(_MediaPhotoBase): + """A downloaded photo that has also been described by a vision LLM. + + The terminal state for a content-bearing photo: the bytes still live + at `local_path` (every `MediaPhotoDownloaded` invariant carries over) + AND a vision pass has classified the image and written a short prose + description. Decorative photos (avatars, reaction memes, abstract + backgrounds) reach this variant too — they carry `is_decorative=True` + and an empty `description`, so downstream callers can filter them out + without re-classifying. + + `description_version` is the rubric/version tag the description was + produced under. Bumping the configured version invalidates existing + entries: the next `xbrain describe` run treats them as stale and + re-describes (no `--force` needed). `description_lang` records the + language the prose was written in so a vault that switches + `output_language` mid-corpus can be re-described selectively. + + `description` is empty for decorative photos by contract — the + vision rubric returns an empty string in that case and a refusal + (faces, NSFW) is also bucketed as decorative with empty description. + No `gt=0` constraint here for that reason. + """ + + kind: Literal["photo_described"] = "photo_described" + local_path: str + width: int = Field(gt=0) + height: int = Field(gt=0) + bytes_size: int = Field(gt=0) + downloaded_at: datetime + is_decorative: bool + description: str + description_lang: str + description_version: str + described_at: datetime + + @field_validator("local_path") + @classmethod + def _reject_path_traversal(cls, value: str) -> str: + """Reject absolute paths and `..` components. + + Mirrors `MediaPhotoDownloaded._reject_path_traversal` — the + described variant inherits the same on-disk path contract because + the bytes from the prior Downloaded state are still referenced + verbatim. Persisted `items.json` is on-disk plain JSON the user + can edit, so the defence in depth is reapplied here. + """ + _ = cls + if value.startswith("/") or value.startswith("\\"): + raise ValueError(f"local_path must be relative, got {value!r}") + for part in value.replace("\\", "/").split("/"): + if part == "..": + raise ValueError(f"local_path must not contain '..' components: {value!r}") + return value + + class MediaVideoPending(BaseModel): """A video URL captured but not downloaded. @@ -181,9 +237,12 @@ def _normalise_legacy_media(value: Any) -> Any: Records that already carry a ``kind`` field are passed through unchanged — they are either fresh (extract-time) or already in the - tagged-union shape. A record with neither `kind` nor a recognised - `type` is passed through unchanged so pydantic raises a clean - discriminator error rather than this validator inventing a state. + tagged-union shape (`photo_pending` / `photo_downloaded` / + `photo_failed` / `photo_described` / `video_pending`). A record with + neither `kind` nor a recognised `type` is passed through unchanged + so pydantic raises a clean discriminator error rather than this + validator inventing a state. The described variant has no legacy + shape — it can only originate from a vision-describe run. """ if not isinstance(value, dict): return value @@ -203,7 +262,13 @@ def _normalise_legacy_media(value: Any) -> Any: # (see the long comment above): the discriminator check must run AFTER # the legacy normaliser, otherwise legacy records get rejected. _MediaTagged = Annotated[ - Union[MediaPhotoPending, MediaPhotoDownloaded, MediaPhotoFailed, MediaVideoPending], + Union[ + MediaPhotoPending, + MediaPhotoDownloaded, + MediaPhotoFailed, + MediaPhotoDescribed, + MediaVideoPending, + ], Field(discriminator="kind"), ] MediaEntry = Annotated[ @@ -215,7 +280,13 @@ def _normalise_legacy_media(value: Any) -> Any: # TypeAdapter for tests / ad-hoc validation of a single entry outside an # `Item` context (mirrors `ContentSourceAdapter`). MediaEntryAdapter: TypeAdapter[ - Union[MediaPhotoPending, MediaPhotoDownloaded, MediaPhotoFailed, MediaVideoPending] + Union[ + MediaPhotoPending, + MediaPhotoDownloaded, + MediaPhotoFailed, + MediaPhotoDescribed, + MediaVideoPending, + ] ] = TypeAdapter(MediaEntry) diff --git a/tests/test_models.py b/tests/test_models.py index 7b0bc84..8df67c9 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -292,6 +292,122 @@ def test_media_new_tagged_shape_passes_through(): assert entry.width == 1200 +def test_media_photo_described_round_trips_through_json(): + """A `MediaPhotoDescribed` payload reads back as the same variant. + + Exercises the discriminator path: the new `photo_described` kind + must match exactly one variant. A round-trip dump must NOT collapse + back to `MediaPhotoDownloaded` (which carries the same on-disk + fields minus the description payload). + """ + from datetime import datetime, timezone + + from xbrain.models import MediaEntryAdapter, MediaPhotoDescribed + + payload = { + "kind": "photo_described", + "type": "photo", + "url": "https://pbs.twimg.com/media/X.jpg", + "local_path": "123/0.jpg", + "width": 1200, + "height": 800, + "bytes_size": 99000, + "downloaded_at": datetime(2026, 5, 24, tzinfo=timezone.utc).isoformat(), + "is_decorative": False, + "description": "A chart showing model accuracy by parameter count.", + "description_lang": "English", + "description_version": "v1", + "described_at": datetime(2026, 5, 24, 12, tzinfo=timezone.utc).isoformat(), + } + entry = MediaEntryAdapter.validate_python(payload) + assert isinstance(entry, MediaPhotoDescribed) + assert entry.is_decorative is False + assert entry.description.startswith("A chart") + assert entry.description_lang == "English" + assert entry.description_version == "v1" + # Re-dump and re-parse: variant must survive verbatim. + restored = MediaEntryAdapter.validate_python(entry.model_dump(mode="json")) + assert isinstance(restored, MediaPhotoDescribed) + assert restored == entry + + +def test_media_photo_described_decorative_carries_empty_description(): + """Decorative entries store an empty description by contract. + + The vision rubric returns empty for decorative; refusals (faces, NSFW) + are bucketed as decorative + empty. The model accepts this explicitly + — there is no `gt=0` length constraint on `description`. + """ + from datetime import datetime, timezone + + from xbrain.models import MediaPhotoDescribed + + entry = MediaPhotoDescribed( + url="https://pbs.twimg.com/media/X.jpg", + local_path="123/0.jpg", + width=400, + height=400, + bytes_size=12000, + downloaded_at=datetime(2026, 5, 24, tzinfo=timezone.utc), + is_decorative=True, + description="", + description_lang="English", + description_version="v1", + described_at=datetime(2026, 5, 24, 12, tzinfo=timezone.utc), + ) + assert entry.is_decorative is True + assert entry.description == "" + + +def test_media_photo_described_rejects_absolute_local_path(): + """Path-traversal defence carries over from the downloaded variant — + the bytes referenced by `local_path` are inherited from the prior + state, so the same hardening applies. + """ + import pytest + from pydantic import ValidationError + + from xbrain.models import MediaPhotoDescribed + + with pytest.raises(ValidationError): + MediaPhotoDescribed( + url="https://pbs.twimg.com/media/X.jpg", + local_path="/etc/passwd", + width=1, + height=1, + bytes_size=1, + downloaded_at=datetime(2026, 5, 24, tzinfo=timezone.utc), + is_decorative=False, + description="x", + description_lang="English", + description_version="v1", + described_at=datetime(2026, 5, 24, tzinfo=timezone.utc), + ) + + +def test_media_photo_described_rejects_parent_traversal_local_path(): + """Same defence as the downloaded variant — `..` in `local_path` is rejected.""" + import pytest + from pydantic import ValidationError + + from xbrain.models import MediaPhotoDescribed + + with pytest.raises(ValidationError): + MediaPhotoDescribed( + url="https://pbs.twimg.com/media/X.jpg", + local_path="../escape/x.jpg", + width=1, + height=1, + bytes_size=1, + downloaded_at=datetime(2026, 5, 24, tzinfo=timezone.utc), + is_decorative=False, + description="x", + description_lang="English", + description_version="v1", + described_at=datetime(2026, 5, 24, tzinfo=timezone.utc), + ) + + def test_media_discriminator_rejects_unknown_kind(): """Silently inventing a variant would mask data corruption — reject loudly.""" import pytest From deb094dbb6ed8e36835743fc838b59be7c48bb1d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Victor=20Gonz=C3=A1lez-Pacheco?= Date: Sun, 24 May 2026 19:18:18 +0200 Subject: [PATCH 02/11] [#34] add describe-image rubric with {language} placeholder MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The vision-describe call ships a declarative system prompt — same shape as the other rubrics. It enumerates the decorative-vs-content classification, the per-image description contract (1-3 sentences, plain prose, faithful), the refusal fallback (decorative + empty description), and the exact JSON output shape (list of {index, is_decorative, description}). The `{language}` placeholder follows the existing rubric convention so `output_language` propagates into vision calls the same way it already propagates into summary/topic-page rubrics. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/xbrain/rubrics/rubric-describe-image.md | 72 +++++++++++++++++++++ tests/test_rubrics.py | 21 ++++++ 2 files changed, 93 insertions(+) create mode 100644 src/xbrain/rubrics/rubric-describe-image.md diff --git a/src/xbrain/rubrics/rubric-describe-image.md b/src/xbrain/rubrics/rubric-describe-image.md new file mode 100644 index 0000000..4ebf214 --- /dev/null +++ b/src/xbrain/rubrics/rubric-describe-image.md @@ -0,0 +1,72 @@ +# Rubric — Describe images + +You describe images for a personal knowledge wiki. The descriptions are +read by a downstream LLM that assigns topics and writes topic-page +overviews — they are NOT shown to the user. Write for that LLM: be +factual, dense, and short. + +- **Language:** {language}, regardless of any text visible in the image. +- **Length per description:** 1 to 3 sentences. No preamble ("This image + shows..."). No markdown, no wikilinks, no bullet characters, no quotes. +- **Faithful:** describe only what is visible. Never invent text, numbers + or names. If a chart's labels are unreadable, say so plainly rather + than guessing. + +## Classify each image + +For every image you receive, decide one of two buckets: + +- **`is_decorative: true`** when the image carries no topical content. + Avatars, profile pictures, plain reaction GIFs / memes, abstract + backgrounds, pure aesthetic stills, decorative banners, brand logos + used as ornaments. Decorative images contribute no topic signal — + the downstream LLM will skip them. +- **`is_decorative: false`** when the image conveys information. This + is the common case: screenshots of text / code / charts / diagrams / + papers / dashboards / UIs, photos of whiteboards, slides, product + shots with visible labels, data visualisations, infographics, real + scenes whose content is the point (e.g. a queue at a launch event, + a protest sign, a hardware close-up). + +When in doubt, prefer **`is_decorative: false`**: a description is +cheap, missing topic signal is not. + +## Write each description + +- For a chart: name the chart type, the axes (if labelled), the + comparison being made, and any headline number visible. Two sentences + is usually enough. +- For a screenshot of text: paraphrase the substance in your own words. + Quote a short distinctive phrase only if the verbatim wording matters + (a product name, a thesis statement). +- For a diagram: name the components and the relationships between them + in one sentence; the second sentence may add what the diagram is + arguing. +- For a photo: state what is depicted and any visible text or signage. +- **For a decorative image:** set `description` to the empty string + `""`. Do not write "decorative image" or any placeholder — the empty + string is the contract. + +## Refusals + +If you cannot describe an image (a recognisable face, NSFW, or any +content you must decline), do not raise an error: emit +`is_decorative: true` with `description: ""`. The downstream LLM will +treat the entry as a decorative no-signal photo. No special-case +handling is needed. + +## Output format + +Respond with a single JSON list, one entry per image in the order you +received them. Use `index` to disambiguate; the caller maps it back to +the input position. + +```json +[ + {"index": 0, "is_decorative": false, "description": "Line chart comparing GPT-4 and Claude on MMLU; Claude is 2 points higher."}, + {"index": 1, "is_decorative": true, "description": ""} +] +``` + +- Exactly one entry per input image. +- No extra keys, no preamble, no surrounding prose. diff --git a/tests/test_rubrics.py b/tests/test_rubrics.py index 2095d1a..688a706 100644 --- a/tests/test_rubrics.py +++ b/tests/test_rubrics.py @@ -86,3 +86,24 @@ def test_topic_page_rubric_loads(): text = load_rubric("topic-page") assert "overview" in text assert "notes" in text + + +def test_describe_image_rubric_loads_and_substitutes_language(): + """The describe-image rubric ships a `{language}` placeholder; the + loader must substitute it, and the defensive check must not trip on + correctly-spelt placeholders. + """ + text = load_rubric("describe-image", language="English") + assert "{language}" not in text + assert "English" in text + # Sanity: the contract keys must appear in the prompt so the LLM + # produces the right JSON shape. + assert "is_decorative" in text + assert "description" in text + assert "index" in text + + +def test_describe_image_rubric_preserves_placeholder_when_language_none(): + """No-language calls (tests, manual inspection) keep the literal placeholder.""" + text = load_rubric("describe-image") + assert "{language}" in text From b59d6106dce73a226130d3605c41303144649da4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Victor=20Gonz=C3=A1lez-Pacheco?= Date: Sun, 24 May 2026 19:28:12 +0200 Subject: [PATCH 03/11] [#34] add xbrain describe orchestrator + Anthropic client wiring MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The orchestrator walks every eligible photo, batches the bytes into vision-API calls (default 5 images per call), parses the per-image JSON list judgments, and transitions matched entries to MediaPhotoDescribed. Decorative classifications are persisted with an empty description so downstream consumers (enrich-time prompt, topic-synth prompt) can filter them out at a single seam. The shape mirrors xbrain.executors.api and xbrain.topic_synth: narrow recoverable-errors tuple, per-batch failure isolation, logger.warning on every failure, RuntimeError on total failure, and a SUMMARY: described/failed/skipped stderr line. Programmer bugs (AttributeError) and KeyboardInterrupt propagate as their narrow catch is on Exception subclasses only. Eligibility uses a description-version tag: bumping the configured version invalidates persisted descriptions automatically (no --force needed). The Ctrl-C-coherent invariant is delegated to an on_progress callback the CLI wires to a store write. 41 unit tests exercise idempotency, force/stale-version, batching, partial+total failure, refusal handling, language plumbing, programmer-bug propagation, KeyboardInterrupt, file-missing fallback, summary line, and clock injection — all via FakeAnthropic. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/xbrain/describe.py | 772 +++++++++++++++++++++++++++++++++ tests/test_describe.py | 953 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 1725 insertions(+) create mode 100644 src/xbrain/describe.py create mode 100644 tests/test_describe.py diff --git a/src/xbrain/describe.py b/src/xbrain/describe.py new file mode 100644 index 0000000..61f514e --- /dev/null +++ b/src/xbrain/describe.py @@ -0,0 +1,772 @@ +"""Describe downloaded photos with a vision LLM; feed descriptions into enrich. + +The `describe_all` orchestrator walks every photo entry the downloader +has produced, batches the bytes into vision-API calls (default: 5 images +per call to Claude Sonnet), parses the per-image JSON judgments, and +transitions matched entries to `MediaPhotoDescribed`. The persisted +description is consumed by the enrich-time prompt in `executors.api` +and the topic-synth prompt in `topic_synth` — decorative photos are +filtered out at that consumption seam so they introduce no topic noise. + +The structure mirrors `xbrain.executors.api` and `xbrain.topic_synth`: +a recoverable-errors tuple, per-batch failure isolation, +`logger.warning` on every failure, `RuntimeError` on total failure, and +a `SUMMARY: described: N, failed: M, skipped: K` stderr line for the +CLI. Programmer bugs (`AttributeError`) and `KeyboardInterrupt` +propagate — narrow `except` clauses only. + +I/O dependencies (the Anthropic client, the media root path) are +keyword-injectable so tests run offline. The store mutation is in +place; the caller is expected to wrap each transition with a +store-write (the `on_progress` callback fires after every batch). +""" + +from __future__ import annotations + +import base64 +import io +import json +import logging +import sys +import time +from collections.abc import Callable, Iterator +from dataclasses import dataclass, field +from datetime import datetime, timezone +from pathlib import Path + +from xbrain.models import ( + Item, + MediaPhotoDescribed, + MediaPhotoDownloaded, +) +from xbrain.rubrics import load_rubric + +logger = logging.getLogger(__name__) + +# Default per-call image count. The vision rubric is tuned for batches in +# the 1-10 range; 5 is the sweet spot the spec settled on (12-15 % token +# saving vs per-image, modest added complexity). +_DEFAULT_BATCH_SIZE = 5 + +# Token ceiling for the JSON list response. Per-image average is ~3 +# sentences ≈ 80 tokens of prose + 20 of JSON scaffolding = ~100 tokens. +# A batch of 5 fits comfortably under 600; the cap is set high enough to +# survive an over-eager model that emits long descriptions. +_MAX_TOKENS = 1200 + +# Map file extensions to the Anthropic vision media-type strings. The +# downloader writes one of these three (see `xbrain.media._FORMAT_EXTENSIONS`). +_MEDIA_TYPES: dict[str, str] = { + ".jpg": "image/jpeg", + ".jpeg": "image/jpeg", + ".png": "image/png", + ".webp": "image/webp", +} + + +def _utcnow() -> datetime: + """Default `now` clock — UTC-aware `datetime.now()`. + + Module-level (not a lambda inside `describe_all`) so tests can + monkeypatch a deterministic clock without going through the `now=` + keyword argument on every call site. + """ + return datetime.now(timezone.utc) + + +@dataclass +class DescribeReport: + """Counts emitted by `describe_all` for the CLI's SUMMARY line. + + `photos_skipped_already_described` is the idempotency proof — a + no-op re-run with no version bump must report every previously + described photo here. `batches_attempted` counts API calls actually + issued; `batches_failed` counts the ones the recoverable-errors + tuple swallowed. A run with `photos_attempted > 0 and + photos_described == 0` raises before this report leaves the + orchestrator. + """ + + items_processed: int = 0 + photos_attempted: int = 0 + photos_described: int = 0 + photos_failed: int = 0 + photos_skipped_already_described: int = 0 + batches_attempted: int = 0 + batches_failed: int = 0 + elapsed_seconds: float = 0.0 + # Per-item failures keyed by item id → list of (url, error) tuples. + # Surfaces in the verbose CLI output without re-walking the store. + per_item_failures: dict[str, list[tuple[str, str]]] = field(default_factory=dict) + + +@dataclass(frozen=True) +class _Candidate: + """One photo eligible for description on this run. + + Holds back-references to `Item` and the media-list `index` so the + orchestrator can swap the transitioned variant back into place + without re-scanning the store. `bytes_data` is loaded lazily by + `_load_bytes` — failing to read the file is a per-photo failure, + not a total-run abort. + """ + + item_id: str + item: Item + index: int + entry: MediaPhotoDownloaded | MediaPhotoDescribed + + +def _recoverable_errors() -> tuple[type[Exception], ...]: + """Exception classes a per-batch failure should swallow + log + continue on. + + Mirrors `xbrain.executors.api._recoverable_errors`. `anthropic.APIError` + covers auth, rate-limit, server-side and network errors the SDK + normalises. `ValueError` covers validator rejections (and + `pydantic.ValidationError`, a `ValueError` subclass in pydantic v2). + `json.JSONDecodeError` covers malformed LLM responses. `KeyError` + covers responses missing expected fields. `OSError` covers + file-read failures when streaming photo bytes off disk (a missing + file under `data/media/` is per-photo recoverable, not a total + abort). + + Lazy-imported because `anthropic` is optional in the test + environment (the client is faked via `tests.conftest.FakeAnthropic`). + """ + try: + from anthropic import APIError + + return (APIError, ValueError, json.JSONDecodeError, KeyError, OSError) + except ImportError: + return (ValueError, json.JSONDecodeError, KeyError, OSError) + + +def _is_stale(entry: MediaPhotoDescribed, *, current_version: str) -> bool: + """A described entry is stale when its version no longer matches the config. + + Bumping `describe_version` in `config.toml` is the manual trigger to + re-describe the whole corpus against a new rubric — no `--force` + needed. The version check is exact-string: there is no ordering + relation between versions, only equality, so a deliberate downgrade + is also a "describe again" signal. + """ + return entry.description_version != current_version + + +def _eligible( + entry: object, + *, + force: bool, + current_version: str, +) -> bool: + """Decide whether `describe_all` should attempt this entry on THIS run. + + `MediaPhotoDownloaded` entries are always eligible — they have not + been described yet by definition. `MediaPhotoDescribed` entries are + eligible only when `--force` is set or the persisted version is + stale vs the current `describe_version` config. Every other + variant (`MediaPhotoPending`, `MediaPhotoFailed`, `MediaVideoPending`) + is out of scope — describing only runs over photos whose bytes are + already on disk. + """ + if isinstance(entry, MediaPhotoDownloaded): + return True + if isinstance(entry, MediaPhotoDescribed): + if force: + return True + return _is_stale(entry, current_version=current_version) + return False + + +def _tally_skipped( + entry: object, + *, + current_version: str, + report: DescribeReport, +) -> None: + """Bump `photos_skipped_already_described` when this entry is a no-op skip. + + Pulled out of the candidate iterator so the loop body in + `_iter_candidates` keeps a low complexity grade. Only described + entries on the current version count as skips — pending/failed/ + video entries are silently out of scope for `xbrain describe`. + """ + if isinstance(entry, MediaPhotoDescribed) and not _is_stale( + entry, current_version=current_version + ): + report.photos_skipped_already_described += 1 + + +def _iter_item_candidates( + item_id: str, + item: Item, + *, + force: bool, + current_version: str, + report: DescribeReport, +) -> Iterator[_Candidate]: + """Yield every eligible candidate inside one item's media list. + + Pulled out of the cross-item iterator so the outer scan stays + flat. Per-entry tallies (skips) go on the report; the caller + decides whether the item itself counts as processed by checking + if this iterator yielded anything. + """ + for index, entry in enumerate(item.media): + if not _eligible(entry, force=force, current_version=current_version): + _tally_skipped(entry, current_version=current_version, report=report) + continue + assert isinstance(entry, (MediaPhotoDownloaded, MediaPhotoDescribed)) + yield _Candidate(item_id=item_id, item=item, index=index, entry=entry) + + +def _take_with_limit(candidates: Iterator[_Candidate], limit: int | None) -> Iterator[_Candidate]: + """Yield from `candidates`, stopping after `limit` items. + + Pulled out so `_iter_candidates` does not interleave the + limit-countdown with the per-item bookkeeping. `None` means "no + limit" — yields everything. + """ + if limit is None: + yield from candidates + return + remaining = limit + for candidate in candidates: + if remaining <= 0: + return + remaining -= 1 + yield candidate + + +def _iter_candidates( + items: dict[str, Item], + *, + force: bool, + limit: int | None, + items_filter: set[str] | None, + current_version: str, + report: DescribeReport, +) -> Iterator[_Candidate]: + """Yield each candidate eligible for description on THIS run. + + Side effects on `report`: bumps `items_processed` once per item + that contributes at least one yielded candidate, and bumps + `photos_skipped_already_described` for each `MediaPhotoDescribed` + entry passed over (via `_tally_skipped`). Stops yielding once + `limit` is exhausted. + + `items_processed` is bumped on the FIRST yielded candidate of an + item, not at the end of its scan — that way a `limit` that + truncates mid-item still counts the item as processed (work + happened on it). Items whose every photo was skipped do NOT count + as processed. + """ + seen_item_ids: set[str] = set() + + def _all_eligible() -> Iterator[_Candidate]: + for item_id, item in items.items(): + if items_filter is not None and item_id not in items_filter: + continue + if not item.media: + continue + yield from _iter_item_candidates( + item_id, + item, + force=force, + current_version=current_version, + report=report, + ) + + for candidate in _take_with_limit(_all_eligible(), limit): + if candidate.item_id not in seen_item_ids: + seen_item_ids.add(candidate.item_id) + report.items_processed += 1 + yield candidate + + +def _chunked(candidates: list[_Candidate], size: int) -> Iterator[list[_Candidate]]: + """Split a candidate list into batches of at most `size` entries. + + `size` is guaranteed positive by the CLI layer (`Typer` validates + integer ranges); this helper does not re-validate. An empty input + yields nothing — the orchestrator never issues a no-op API call. + """ + for start in range(0, len(candidates), size): + yield candidates[start : start + size] + + +def _media_type(local_path: str) -> str: + """Map an on-disk path's extension to its Anthropic media-type string. + + The downloader writes one of `.jpg` / `.png` / `.webp` (see + `xbrain.media._FORMAT_EXTENSIONS`). Anything else came from a hand + edit of `items.json` or a future format we have not registered — + fall back to `image/jpeg` and let Anthropic reject it if the bytes + do not match. The fall-back keeps a per-photo failure out of the + total-failure raise path. + """ + suffix = Path(local_path).suffix.lower() + return _MEDIA_TYPES.get(suffix, "image/jpeg") + + +def _load_bytes(media_root: Path, local_path: str) -> bytes: + """Read the photo bytes from `data/media/`. + + Raises `OSError` (a `FileNotFoundError` subclass when the file is + missing). The orchestrator's `_recoverable_errors` tuple catches it + so a missing file is a per-photo failure (the operator can re-run + `xbrain media` to repopulate), never a whole-batch abort. + """ + return (media_root / local_path).read_bytes() + + +def _build_image_block(data: bytes, media_type: str) -> dict: + """Build one Anthropic vision content block from raw photo bytes. + + The wire shape is `{type: image, source: {type: base64, media_type, data}}`. + Tests bypass this by using a `FakeAnthropic` that does not inspect + `messages`; production uses the real SDK which validates the shape. + """ + return { + "type": "image", + "source": { + "type": "base64", + "media_type": media_type, + "data": base64.b64encode(data).decode("ascii"), + }, + } + + +def _system_prompt(language: str) -> str: + """Build the system prompt — the declarative rubric with `{language}` substituted.""" + return load_rubric("describe-image", language=language) + + +def _user_directive(batch_size: int) -> str: + """Plain-text directive that follows the image blocks in the user turn. + + Tells the model the index range so the JSON list it emits is + contractually one-to-one with the input order. Indices are + zero-based to match Python and the rubric's example. + """ + last = batch_size - 1 + return ( + f"Describe images 0 through {last}. " + "Return a JSON list with one entry per image, in the order received." + ) + + +def _extract_response_text(response: object) -> str: + """Pull the JSON-bearing text out of a vision response, stripping fences. + + The Anthropic SDK packs the model's reply in `.content` as a list + of typed blocks; only `text` blocks carry JSON. Some models wrap + the JSON in a ```json ... ``` Markdown fence despite the rubric + explicitly forbidding it — strip a single leading/trailing fence + pair so the downstream `json.loads` does not trip on a Markdown + artefact. + """ + blocks = [b for b in getattr(response, "content", []) if getattr(b, "type", None) == "text"] + if not blocks: + raise ValueError("vision response has no text block") + text = "".join(b.text for b in blocks).strip() + if text.startswith("```"): + first_newline = text.find("\n") + if first_newline != -1: + text = text[first_newline + 1 :] + if text.endswith("```"): + text = text[: -len("```")] + text = text.strip() + return text + + +def _validate_judgment_entry(entry: object, *, batch_size: int) -> dict: + """Validate one per-image judgment dict and return it on success. + + Pulled out of `_parse_batch_response` so the loop body in the + parser stays a one-liner — keeps the complexity grade off C. + Enforces the wire contract: required keys + int (not bool) index + in `[0, batch_size)` + bool decorative flag + str description. + """ + if not isinstance(entry, dict): + raise ValueError(f"vision response entry is not a JSON object: {entry!r}") + if not {"index", "is_decorative", "description"} <= entry.keys(): + raise ValueError( + f"vision response entry missing required keys " + f"(got {sorted(entry)!r}, need index/is_decorative/description)" + ) + index = entry["index"] + # `bool` is a subclass of `int` in Python — exclude it explicitly so a + # `True`/`False` index does not silently become 1/0. + if not isinstance(index, int) or isinstance(index, bool): + raise ValueError(f"vision response `index` must be int, got {index!r}") + if not (0 <= index < batch_size): + raise ValueError(f"vision response `index` {index} out of batch range [0, {batch_size})") + if not isinstance(entry["is_decorative"], bool): + raise ValueError( + f"vision response `is_decorative` must be bool, got {entry['is_decorative']!r}" + ) + if not isinstance(entry["description"], str): + raise ValueError(f"vision response `description` must be str, got {entry['description']!r}") + return entry + + +def _parse_batch_response(response: object, batch_size: int) -> list[dict]: + """Extract and validate the JSON list of judgments from a vision response. + + Splits responsibility cleanly: `_extract_response_text` deals with + transport (text blocks + Markdown fence noise), this function deals + with shape (list of judgments with unique indices), and + `_validate_judgment_entry` deals with per-entry typing. Each step + raises `ValueError` so the orchestrator's `_recoverable_errors` + tuple catches every shape violation as a per-batch failure. + """ + text = _extract_response_text(response) + decoded = json.loads(text) + if not isinstance(decoded, list): + raise ValueError(f"vision response is not a JSON list (got {type(decoded).__name__})") + seen_indices: set[int] = set() + judgments: list[dict] = [] + for entry in decoded: + validated = _validate_judgment_entry(entry, batch_size=batch_size) + index = validated["index"] + if index in seen_indices: + raise ValueError(f"vision response has duplicate index {index}") + seen_indices.add(index) + judgments.append(validated) + return judgments + + +def _apply_judgment( + candidate: _Candidate, + judgment: dict, + *, + language: str, + version: str, + described_at: datetime, +) -> MediaPhotoDescribed: + """Build the post-transition `MediaPhotoDescribed` variant for one candidate. + + Decorative entries are written with an empty description regardless + of what the model returned for the `description` field — the rubric + contract is "empty for decorative" and we enforce it at the + boundary so downstream code can rely on `is_decorative => not + description`. The pre-existing `local_path` / `width` / `height` / + `bytes_size` / `downloaded_at` fields are carried over verbatim + from the prior variant (Downloaded or Described). + """ + is_decorative = bool(judgment["is_decorative"]) + description = "" if is_decorative else str(judgment["description"]) + return MediaPhotoDescribed( + url=candidate.entry.url, + local_path=candidate.entry.local_path, + width=candidate.entry.width, + height=candidate.entry.height, + bytes_size=candidate.entry.bytes_size, + downloaded_at=candidate.entry.downloaded_at, + is_decorative=is_decorative, + description=description, + description_lang=language, + description_version=version, + described_at=described_at, + ) + + +def _record_batch_failure( + report: DescribeReport, + batch: list[_Candidate], + error: str, +) -> None: + """Mark every candidate in a failed batch as a per-photo failure. + + A per-batch API error means we have no judgments for any of the + images in that batch — each photo stays in its current variant + (`MediaPhotoDownloaded` or stale `MediaPhotoDescribed`) and is + eligible again on the next `xbrain describe` run. The counts and + `per_item_failures` map reflect the per-photo unit so a partial-batch + success on a follow-up run can still net positive without confusing + the totals. + """ + report.batches_failed += 1 + for candidate in batch: + report.photos_failed += 1 + report.per_item_failures.setdefault(candidate.item_id, []).append( + (candidate.entry.url, error) + ) + + +def describe_all( + items: dict[str, Item], + media_root: Path, + *, + model: str, + output_language: str, + description_version: str, + force: bool = False, + limit: int | None = None, + items_filter: list[str] | None = None, + batch_size: int = _DEFAULT_BATCH_SIZE, + client: object | None = None, + on_progress: Callable[[], None] | None = None, + now: Callable[[], datetime] | None = None, +) -> DescribeReport: + """Describe every eligible photo across the store; return a structured report. + + Eligibility (without `--force`): + - `MediaPhotoDownloaded` — always. + - `MediaPhotoDescribed` whose `description_version` ≠ `description_version` + (the per-call argument, sourced from `config.describe_version`). + + With `--force`: + - Every `MediaPhotoDownloaded` AND every `MediaPhotoDescribed` is + re-described. The persisted description is overwritten. + + Out of scope (every run): + - `MediaPhotoPending` — describing only runs over photos with bytes + on disk; the operator must call `xbrain media` first. + - `MediaPhotoFailed` — same reason. + - `MediaVideoPending` — photos only; vision-API support for video + is a separate phase. + + The function mutates `items` in place; the caller is expected to + wrap each batch transition with a store write (the `on_progress` + callback fires after every batch, success or failure). The + Ctrl-C-coherent invariant lives there: the store is written + between batches, never mid-API-call. + + `media_root` is the directory under which `/.` + photo files live (typically `data/media/`). + + Raises: + RuntimeError: when EVERY batch attempted in the run fails. A + total failure (a revoked API key, an exhausted quota, a + corrupted media tree) must surface as a non-zero exit. The + CLI's `_handle_cli_errors` converts this into a clean + operator message + exit code 1. + """ + if client is None: + from anthropic import Anthropic # lazy: tests inject FakeAnthropic + + client = Anthropic() # reads ANTHROPIC_API_KEY from the environment + clock: Callable[[], datetime] = now if now is not None else _utcnow + + started = time.monotonic() + report = DescribeReport() + filter_set = set(items_filter) if items_filter else None + candidates = list( + _iter_candidates( + items, + force=force, + limit=limit, + items_filter=filter_set, + current_version=description_version, + report=report, + ) + ) + if not candidates: + report.elapsed_seconds = time.monotonic() - started + return report + + system = _system_prompt(output_language) + recoverable = _recoverable_errors() + + for batch in _chunked(candidates, batch_size): + _run_one_batch( + batch=batch, + client=client, + model=model, + system=system, + output_language=output_language, + description_version=description_version, + clock=clock, + media_root=media_root, + recoverable=recoverable, + report=report, + ) + if on_progress is not None: + on_progress() + + report.elapsed_seconds = time.monotonic() - started + _raise_on_total_failure(report) + _print_partial_failure_summary(report) + return report + + +def _run_one_batch( + *, + batch: list[_Candidate], + client: object, + model: str, + system: str, + output_language: str, + description_version: str, + clock: Callable[[], datetime], + media_root: Path, + recoverable: tuple[type[Exception], ...], + report: DescribeReport, +) -> None: + """Execute one batch end-to-end: build, call, parse, apply, catch. + + Pulled out of `describe_all` to keep the outer loop's complexity + under grade C while preserving the per-batch isolation contract. + Programmer bugs (`AttributeError`, ...) and `KeyboardInterrupt` + fall outside `recoverable` and propagate. + """ + report.batches_attempted += 1 + report.photos_attempted += len(batch) + try: + content_blocks = _build_user_blocks(batch, media_root) + response = _messages_create( + client=client, + model=model, + max_tokens=_MAX_TOKENS, + system=system, + content_blocks=content_blocks, + ) + judgments = _parse_batch_response(response, batch_size=len(batch)) + _apply_batch_judgments( + batch=batch, + judgments=judgments, + language=output_language, + version=description_version, + described_at=clock(), + report=report, + ) + except recoverable as exc: + message = str(exc) + logger.warning("describe: batch failed (%d photos): %s", len(batch), message) + _record_batch_failure(report, batch, message) + + +def _raise_on_total_failure(report: DescribeReport) -> None: + """Raise `RuntimeError` when every attempted batch failed. + + A total-failure run is a non-zero-exit signal — the CLI's + `_handle_cli_errors` converts the raise into a clean operator + message + exit code 1. Pulled out so `describe_all` keeps its + complexity grade under C. + """ + if report.photos_attempted > 0 and report.photos_described == 0: + raise RuntimeError( + f"All {report.batches_attempted} describe batches failed " + f"({report.photos_attempted} photos); see warnings above for details." + ) + + +def _print_partial_failure_summary(report: DescribeReport) -> None: + """Print the SUMMARY line on partial failure — mirrors `executors.api`. + + Stays silent on a clean run (no failures = no noise) and on a + total-failure run (the raise above is the signal). The line shape + matches `xbrain.media.emit_summary_line` so log parsers can rely + on a single `SUMMARY:` prefix across all stages. + """ + if report.photos_failed > 0: + print( + f"SUMMARY: described: {report.photos_described}, " + f"failed: {report.photos_failed}, " + f"skipped: {report.photos_skipped_already_described}", + file=sys.stderr, + ) + + +def _build_user_blocks(batch: list[_Candidate], media_root: Path) -> list[dict]: + """Build the user-turn content blocks for one batch: images + directive. + + Loads each photo's bytes from disk (raises `OSError` on a missing + file — caught by the orchestrator's `_recoverable_errors` so the + whole batch is marked failed and the next run re-attempts). Each + image is encoded once; the directive text block follows last so the + rubric's "Describe images 0 through N" framing pairs with the + visual context above it. + """ + blocks: list[dict] = [] + for candidate in batch: + data = _load_bytes(media_root, candidate.entry.local_path) + blocks.append(_build_image_block(data, _media_type(candidate.entry.local_path))) + blocks.append({"type": "text", "text": _user_directive(len(batch))}) + return blocks + + +def _messages_create( + *, + client: object, + model: str, + max_tokens: int, + system: str, + content_blocks: list[dict], +) -> object: + """Thin wrapper around the Anthropic SDK's `messages.create`. + + Pulled out so the orchestrator stays readable and so tests can + inject a `FakeAnthropic` whose `messages.create` records kwargs. + Returns the raw response — the parser inspects `.content` blocks. + """ + return client.messages.create( # type: ignore[attr-defined] + model=model, + max_tokens=max_tokens, + system=system, + messages=[{"role": "user", "content": content_blocks}], + ) + + +def _apply_batch_judgments( + *, + batch: list[_Candidate], + judgments: list[dict], + language: str, + version: str, + described_at: datetime, + report: DescribeReport, +) -> None: + """Apply per-image judgments to the batch, transitioning each entry. + + Pairs each judgment with the candidate at its `index`. A judgment + count that does not match the batch size is a contract violation + (parser already enforces the index range) — if the list is short, + the missing candidates are tallied as per-photo failures so they + re-enter the candidate pool on the next run; if it is long, the + excess is a programmer bug (parser-side dup check rules out + duplicate indices already). + """ + by_index = {entry["index"]: entry for entry in judgments} + if len(by_index) != len(judgments): + # Defence in depth — `_parse_batch_response` already rejects + # duplicate indices, so reaching here means the parser was + # bypassed. Raise so the developer sees it. + raise ValueError("internal: duplicate judgment indices survived parser check") + for position, candidate in enumerate(batch): + judgment = by_index.get(position) + if judgment is None: + report.photos_failed += 1 + report.per_item_failures.setdefault(candidate.item_id, []).append( + (candidate.entry.url, f"vision response omitted index {position}") + ) + continue + new_entry = _apply_judgment( + candidate, + judgment, + language=language, + version=version, + described_at=described_at, + ) + candidate.item.media[candidate.index] = new_entry + report.photos_described += 1 + + +def emit_summary_line(report: DescribeReport, *, out: "io.IOBase | None" = None) -> None: + """Print the SUMMARY line on stderr (mirrors `media.emit_summary_line`). + + Stays silent on a fully no-op run (no attempts, no skips) — a + `--limit 0` or an `--items` filter that matched nothing produces no + noise. `out` is injectable for tests; defaults to `sys.stderr`. + """ + if report.photos_attempted == 0 and report.photos_skipped_already_described == 0: + return + target = out if out is not None else sys.stderr + print( + f"SUMMARY: described: {report.photos_described}, " + f"failed: {report.photos_failed}, " + f"skipped: {report.photos_skipped_already_described}", + file=target, + ) diff --git a/tests/test_describe.py b/tests/test_describe.py new file mode 100644 index 0000000..a3a8ff0 --- /dev/null +++ b/tests/test_describe.py @@ -0,0 +1,953 @@ +"""Tests for `xbrain.describe` — the vision-describe orchestrator. + +The Anthropic client is faked via `tests.conftest.FakeAnthropic`; no +real API calls. Photo bytes are written to a tmp `data/media/` tree so +the orchestrator can read them through its normal `_load_bytes` path. + +Coverage targets every contract the spec calls out: +- variant transitions (Downloaded → Described; Described stale → Described) +- idempotency (no-op re-runs skip already-described entries) +- batching (5-at-a-time by default; partial batch at the end) +- per-batch error isolation (one failing batch does not abort the run) +- total-failure raise (every batch errored) +- refusal handling (decorative + empty description, never crashes) +- language plumbing (the rubric ships with `{language}` substituted) +- programmer-bug propagation (`AttributeError` is NOT swallowed) +- Ctrl-C propagation (`KeyboardInterrupt` is NOT swallowed) +- stale-version logic + `--force` semantics +- summary line on stderr +""" + +from __future__ import annotations + +from datetime import datetime, timezone +from pathlib import Path + +import pytest + +from xbrain.describe import ( + DescribeReport, + _eligible, + _parse_batch_response, + _validate_judgment_entry, + describe_all, + emit_summary_line, +) +from xbrain.models import ( + Author, + Item, + MediaPhotoDescribed, + MediaPhotoDownloaded, + MediaPhotoFailed, + MediaPhotoPending, + MediaVideoPending, +) + +from tests.conftest import FakeAnthropic, FakeBlock, FakeResponse + +# --------------------------------------------------------------------- fixtures + helpers + + +def _photo_bytes_jpg() -> bytes: + """A minimal 4x3 JPEG so the loader's `read_bytes` returns something realistic. + + We do NOT exercise Pillow in describe-tests — the orchestrator only + reads the file and base64-encodes it. Any non-empty bytes work; we + use real JPEG bytes so the media-type mapping has something + plausible to round-trip. + """ + from io import BytesIO + + from PIL import Image + + buf = BytesIO() + Image.new("RGB", (4, 3), color=(1, 2, 3)).save(buf, format="JPEG") + return buf.getvalue() + + +def _write_photo(media_root: Path, item_id: str, index: int, ext: str = ".jpg") -> str: + """Write a fake photo to `data/media//`; return the rel path.""" + rel = f"{item_id}/{index}{ext}" + dst = media_root / rel + dst.parent.mkdir(parents=True, exist_ok=True) + dst.write_bytes(_photo_bytes_jpg()) + return rel + + +def _downloaded( + *, + item_id: str = "1", + index: int = 0, + url: str | None = None, + media_root: Path | None = None, +) -> MediaPhotoDownloaded: + """Build a `MediaPhotoDownloaded` whose bytes are on disk (when `media_root`).""" + rel = f"{item_id}/{index}.jpg" + if media_root is not None: + _write_photo(media_root, item_id, index) + return MediaPhotoDownloaded( + url=url or f"https://pbs.twimg.com/media/{item_id}-{index}.jpg", + local_path=rel, + width=4, + height=3, + bytes_size=512, + downloaded_at=datetime(2026, 5, 24, tzinfo=timezone.utc), + ) + + +def _described( + *, + item_id: str = "1", + index: int = 0, + version: str = "v1", + media_root: Path | None = None, +) -> MediaPhotoDescribed: + """Build a `MediaPhotoDescribed` with optional on-disk bytes.""" + rel = f"{item_id}/{index}.jpg" + if media_root is not None: + _write_photo(media_root, item_id, index) + return MediaPhotoDescribed( + url=f"https://pbs.twimg.com/media/{item_id}-{index}.jpg", + local_path=rel, + width=4, + height=3, + bytes_size=512, + downloaded_at=datetime(2026, 5, 24, tzinfo=timezone.utc), + is_decorative=False, + description="a previously-described image", + description_lang="English", + description_version=version, + described_at=datetime(2026, 5, 24, 12, tzinfo=timezone.utc), + ) + + +def _item(item_id: str, media: list) -> Item: + """Build an `Item` populated with the given media entries.""" + return Item( + id=item_id, + source="bookmark", + url=f"https://x.com/a/status/{item_id}", + author=Author(handle="a", name="A"), + text="text", + created_at=datetime(2026, 5, 10, tzinfo=timezone.utc), + captured_at=datetime(2026, 5, 24, tzinfo=timezone.utc), + media=media, + ) + + +def _judgment(index: int, *, decorative: bool = False, description: str = "ok") -> dict: + """Build one per-image judgment dict matching the wire contract.""" + return { + "index": index, + "is_decorative": decorative, + "description": "" if decorative else description, + } + + +def _payload(judgments: list[dict]) -> list[dict]: + """Wrap a list of judgments as the JSON list the rubric expects. + + `FakeAnthropic` JSON-encodes the payload as `json.dumps(payload)` — + a list payload survives that path unchanged because `dumps` accepts + any JSON-serialisable value, not just dicts. The orchestrator's + parser pulls the text from `.content[0].text` and parses it as a + JSON list. + """ + return judgments # type: ignore[return-value] + + +class _FakeListResponse: + """A `FakeResponse`-shaped object whose `.content[0].text` is a JSON list. + + `FakeAnthropic` wraps payloads in `FakeResponse(payload)` which calls + `json.dumps(payload)` — that path expects a dict. For describe we + need to ship a list payload, so we build a parallel response + type that mirrors the shape `_parse_batch_response` consumes. + """ + + def __init__(self, judgments: list[dict]): + import json + + self.content = [type(FakeBlock(payload={}))(payload={})] # placeholder block + # Overwrite the text on the block with the JSON list serialisation. + self.content[0].text = json.dumps(judgments) + + +class _FakeMessagesList: + """A fake `client.messages` that pops `_FakeListResponse` per call. + + Mirrors `tests.conftest.FakeMessages` but uses `_FakeListResponse` + so the payload is a JSON list. Exception instances are raised + rather than wrapped (same convention). + """ + + def __init__(self, payloads: list): + self._payloads = list(payloads) + self.calls: list[dict] = [] + + def create(self, **kwargs) -> _FakeListResponse: + self.calls.append(kwargs) + payload = self._payloads.pop(0) + if isinstance(payload, Exception): + raise payload + return _FakeListResponse(payload) + + +class _FakeVisionClient: + """Drop-in fake for `anthropic.Anthropic` over JSON-list responses.""" + + def __init__(self, payloads: list): + self.messages = _FakeMessagesList(payloads) + + +# --------------------------------------------------------------------- eligibility + + +def test_eligible_downloaded_always_eligible(): + """Downloaded entries are always eligible regardless of force/version.""" + entry = _downloaded() + assert _eligible(entry, force=False, current_version="v1") is True + assert _eligible(entry, force=True, current_version="v1") is True + + +def test_eligible_described_current_version_only_with_force(): + """A described entry on the current version is skipped without --force.""" + entry = _described(version="v1") + assert _eligible(entry, force=False, current_version="v1") is False + assert _eligible(entry, force=True, current_version="v1") is True + + +def test_eligible_described_stale_version_is_eligible(): + """A described entry on a stale version is eligible without --force.""" + entry = _described(version="v1") + assert _eligible(entry, force=False, current_version="v2") is True + + +def test_eligible_pending_failed_video_are_never_eligible(): + """Pending / failed / video entries are out of scope for describe.""" + pending = MediaPhotoPending(url="https://pbs.twimg.com/media/X.jpg") + failed = MediaPhotoFailed( + url="https://pbs.twimg.com/media/X.jpg", + failure_reason="http_4xx", + error="HTTP 404", + attempts=1, + last_attempt_at=datetime(2026, 5, 24, tzinfo=timezone.utc), + ) + video = MediaVideoPending(url="https://video.twimg.com/x.mp4") + for entry in (pending, failed, video): + assert _eligible(entry, force=False, current_version="v1") is False + assert _eligible(entry, force=True, current_version="v1") is False + + +# --------------------------------------------------------------------- parser + + +def test_parse_batch_response_accepts_valid_list(): + """A clean JSON list with the right keys round-trips.""" + response = _FakeListResponse([_judgment(0), _judgment(1, decorative=True)]) + out = _parse_batch_response(response, batch_size=2) + assert [e["index"] for e in out] == [0, 1] + + +def test_parse_batch_response_strips_markdown_fence(): + """Some models wrap JSON in ```json ... ``` despite the rubric.""" + import json + + class _Fenced: + content = [type("B", (), {"type": "text", "text": ""})()] + + fenced_text = "```json\n" + json.dumps([_judgment(0)]) + "\n```" + _Fenced.content[0].text = fenced_text + out = _parse_batch_response(_Fenced(), batch_size=1) + assert out[0]["index"] == 0 + + +def test_parse_batch_response_rejects_non_list_root(): + """A JSON object (not list) at the root violates the wire contract.""" + import json + + class _ObjResponse: + content = [type("B", (), {"type": "text", "text": json.dumps({"oops": True})})()] + + with pytest.raises(ValueError, match="not a JSON list"): + _parse_batch_response(_ObjResponse(), batch_size=1) + + +def test_parse_batch_response_rejects_missing_text_block(): + """A response with no text block at all cannot be parsed.""" + + class _Empty: + content: list = [] + + with pytest.raises(ValueError, match="no text block"): + _parse_batch_response(_Empty(), batch_size=1) + + +def test_parse_batch_response_rejects_duplicate_index(): + """Duplicate indices would silently overwrite a transition — reject.""" + response = _FakeListResponse([_judgment(0), _judgment(0)]) + with pytest.raises(ValueError, match="duplicate index"): + _parse_batch_response(response, batch_size=2) + + +def test_validate_judgment_entry_rejects_bool_as_int_index(): + """`bool` is a subclass of `int` in Python — exclude it explicitly.""" + with pytest.raises(ValueError, match="must be int"): + _validate_judgment_entry( + {"index": True, "is_decorative": False, "description": "x"}, + batch_size=2, + ) + + +def test_validate_judgment_entry_rejects_out_of_range_index(): + """An index outside `[0, batch_size)` is a contract violation.""" + with pytest.raises(ValueError, match="out of batch range"): + _validate_judgment_entry( + {"index": 5, "is_decorative": False, "description": "x"}, + batch_size=2, + ) + + +def test_validate_judgment_entry_rejects_non_bool_decorative(): + """`is_decorative` must be a real bool, not a truthy string.""" + with pytest.raises(ValueError, match="is_decorative"): + _validate_judgment_entry( + {"index": 0, "is_decorative": "yes", "description": "x"}, + batch_size=1, + ) + + +# --------------------------------------------------------------------- orchestrator + + +def test_describe_all_transitions_downloaded_to_described(tmp_path: Path): + """The happy path: one downloaded photo becomes one described entry.""" + media_root = tmp_path / "media" + entry = _downloaded(item_id="1", index=0, media_root=media_root) + item = _item("1", [entry]) + store = {"1": item} + client = _FakeVisionClient([[_judgment(0, description="A diagram.")]]) + report = describe_all( + store, + media_root, + model="claude-sonnet-4-6", + output_language="English", + description_version="v1", + client=client, + ) + assert isinstance(item.media[0], MediaPhotoDescribed) + assert item.media[0].description == "A diagram." + assert item.media[0].description_lang == "English" + assert item.media[0].description_version == "v1" + assert item.media[0].is_decorative is False + assert report.photos_described == 1 + assert report.photos_failed == 0 + assert report.batches_attempted == 1 + + +def test_describe_all_is_noop_for_already_described_current_version(tmp_path: Path): + """Re-running over a described entry at the current version is a no-op.""" + media_root = tmp_path / "media" + entry = _described(item_id="1", index=0, version="v1", media_root=media_root) + item = _item("1", [entry]) + store = {"1": item} + # No payloads queued: a no-op run must not call the API at all. + client = _FakeVisionClient([]) + report = describe_all( + store, + media_root, + model="claude-sonnet-4-6", + output_language="English", + description_version="v1", + client=client, + ) + assert isinstance(item.media[0], MediaPhotoDescribed) + assert item.media[0].description == "a previously-described image" + assert report.photos_described == 0 + assert report.photos_skipped_already_described == 1 + assert report.batches_attempted == 0 + assert client.messages.calls == [] + + +def test_describe_all_force_redescribes_current_version(tmp_path: Path): + """`--force` re-describes everything, current-version included.""" + media_root = tmp_path / "media" + entry = _described(item_id="1", index=0, version="v1", media_root=media_root) + item = _item("1", [entry]) + store = {"1": item} + client = _FakeVisionClient([[_judgment(0, description="updated description")]]) + describe_all( + store, + media_root, + model="claude-sonnet-4-6", + output_language="English", + description_version="v1", + force=True, + client=client, + ) + assert isinstance(item.media[0], MediaPhotoDescribed) + assert item.media[0].description == "updated description" + + +def test_describe_all_redescribes_stale_version_without_force(tmp_path: Path): + """Bumping `description_version` invalidates stale entries automatically.""" + media_root = tmp_path / "media" + entry = _described(item_id="1", index=0, version="v1", media_root=media_root) + item = _item("1", [entry]) + store = {"1": item} + client = _FakeVisionClient([[_judgment(0, description="v2 description")]]) + describe_all( + store, + media_root, + model="claude-sonnet-4-6", + output_language="English", + description_version="v2", + client=client, + ) + assert isinstance(item.media[0], MediaPhotoDescribed) + assert item.media[0].description_version == "v2" + assert item.media[0].description == "v2 description" + + +def test_describe_all_batches_five_images_per_call_by_default(tmp_path: Path): + """Default batch size is 5 — 6 photos must produce 2 API calls (5 + 1).""" + media_root = tmp_path / "media" + item = _item( + "1", + [_downloaded(item_id="1", index=i, media_root=media_root) for i in range(6)], + ) + store = {"1": item} + first_batch = [_judgment(i, description=f"img {i}") for i in range(5)] + second_batch = [_judgment(0, description="img 5")] + client = _FakeVisionClient([first_batch, second_batch]) + describe_all( + store, + media_root, + model="m", + output_language="English", + description_version="v1", + client=client, + ) + assert len(client.messages.calls) == 2 + assert all(isinstance(m, MediaPhotoDescribed) for m in item.media) + + +def test_describe_all_respects_custom_batch_size(tmp_path: Path): + """A non-default `batch_size` must control how many images per call.""" + media_root = tmp_path / "media" + item = _item( + "1", + [_downloaded(item_id="1", index=i, media_root=media_root) for i in range(4)], + ) + store = {"1": item} + client = _FakeVisionClient( + [ + [_judgment(0, description="x"), _judgment(1, description="x")], + [_judgment(0, description="x"), _judgment(1, description="x")], + ] + ) + describe_all( + store, + media_root, + model="m", + output_language="English", + description_version="v1", + batch_size=2, + client=client, + ) + assert len(client.messages.calls) == 2 + + +def test_describe_all_decorative_judgment_writes_empty_description(tmp_path: Path): + """A decorative classification produces an empty description on the variant. + + The orchestrator enforces the rubric contract even if the model + returned non-empty text — `is_decorative` implies empty description + at the boundary. + """ + media_root = tmp_path / "media" + entry = _downloaded(item_id="1", index=0, media_root=media_root) + item = _item("1", [entry]) + store = {"1": item} + # The model put text in description despite is_decorative=True; the + # orchestrator must overwrite it with the empty string. + client = _FakeVisionClient( + [[{"index": 0, "is_decorative": True, "description": "shouldnt persist"}]] + ) + describe_all( + store, + media_root, + model="m", + output_language="English", + description_version="v1", + client=client, + ) + assert isinstance(item.media[0], MediaPhotoDescribed) + assert item.media[0].is_decorative is True + assert item.media[0].description == "" + + +def test_describe_all_isolates_a_batch_error(tmp_path: Path, capsys): + """One failing batch must NOT abort the rest of the run.""" + from anthropic import APIError + + media_root = tmp_path / "media" + item_a = _item("a", [_downloaded(item_id="a", index=0, media_root=media_root)]) + item_b = _item("b", [_downloaded(item_id="b", index=0, media_root=media_root)]) + store = {"a": item_a, "b": item_b} + client = _FakeVisionClient( + [ + APIError("503 service unavailable", request=None, body=None), + [_judgment(0, description="ok")], + ] + ) + report = describe_all( + store, + media_root, + model="m", + output_language="English", + description_version="v1", + batch_size=1, + client=client, + ) + # Item a stayed Downloaded; item b transitioned. + assert isinstance(item_a.media[0], MediaPhotoDownloaded) + assert isinstance(item_b.media[0], MediaPhotoDescribed) + assert report.photos_described == 1 + assert report.photos_failed == 1 + assert report.batches_failed == 1 + err = capsys.readouterr().err + assert "SUMMARY: described: 1, failed: 1" in err + + +def test_describe_all_raises_when_every_batch_fails(tmp_path: Path): + """A total-failure run (every batch errored) must raise RuntimeError.""" + from anthropic import APIError + + media_root = tmp_path / "media" + item = _item( + "1", + [_downloaded(item_id="1", index=i, media_root=media_root) for i in range(2)], + ) + store = {"1": item} + client = _FakeVisionClient( + [ + APIError("401 unauthorized", request=None, body=None), + APIError("401 unauthorized", request=None, body=None), + ] + ) + with pytest.raises(RuntimeError, match=r"All 2 describe batches failed"): + describe_all( + store, + media_root, + model="m", + output_language="English", + description_version="v1", + batch_size=1, + client=client, + ) + + +def test_describe_all_substitutes_output_language_in_system_prompt(tmp_path: Path): + """The rubric's `{language}` placeholder must be substituted before sending.""" + media_root = tmp_path / "media" + entry = _downloaded(item_id="1", index=0, media_root=media_root) + item = _item("1", [entry]) + store = {"1": item} + client = _FakeVisionClient([[_judgment(0)]]) + describe_all( + store, + media_root, + model="m", + output_language="Spanish", + description_version="v1", + client=client, + ) + system = client.messages.calls[0]["system"] + assert "{language}" not in system + assert "Spanish" in system + + +def test_describe_all_records_language_on_described_variant(tmp_path: Path): + """The persisted `description_lang` must reflect the call-time language.""" + media_root = tmp_path / "media" + entry = _downloaded(item_id="1", index=0, media_root=media_root) + item = _item("1", [entry]) + store = {"1": item} + client = _FakeVisionClient([[_judgment(0)]]) + describe_all( + store, + media_root, + model="m", + output_language="Spanish", + description_version="v1", + client=client, + ) + assert isinstance(item.media[0], MediaPhotoDescribed) + assert item.media[0].description_lang == "Spanish" + + +def test_describe_all_propagates_programmer_bugs(tmp_path: Path): + """`AttributeError` (programmer bug) must NOT be swallowed — propagate.""" + media_root = tmp_path / "media" + entry = _downloaded(item_id="1", index=0, media_root=media_root) + item = _item("1", [entry]) + store = {"1": item} + + class _Boom: + class messages: # noqa: N801 + @staticmethod + def create(*_args, **_kwargs): + raise AttributeError("programmer bug — undefined attribute") + + with pytest.raises(AttributeError, match="programmer bug"): + describe_all( + store, + media_root, + model="m", + output_language="English", + description_version="v1", + client=_Boom(), + ) + + +def test_describe_all_propagates_keyboard_interrupt(tmp_path: Path): + """Ctrl-C must propagate — falls through the narrow Exception catch.""" + media_root = tmp_path / "media" + entry = _downloaded(item_id="1", index=0, media_root=media_root) + item = _item("1", [entry]) + store = {"1": item} + + class _CtrlC: + class messages: # noqa: N801 + @staticmethod + def create(*_args, **_kwargs): + raise KeyboardInterrupt + + with pytest.raises(KeyboardInterrupt): + describe_all( + store, + media_root, + model="m", + output_language="English", + description_version="v1", + client=_CtrlC(), + ) + + +def test_describe_all_limit_caps_attempts(tmp_path: Path): + """`--limit N` must stop after N photos have been described.""" + media_root = tmp_path / "media" + item = _item( + "1", + [_downloaded(item_id="1", index=i, media_root=media_root) for i in range(5)], + ) + store = {"1": item} + client = _FakeVisionClient([[_judgment(0), _judgment(1)]]) + describe_all( + store, + media_root, + model="m", + output_language="English", + description_version="v1", + limit=2, + batch_size=2, + client=client, + ) + described = sum(1 for m in item.media if isinstance(m, MediaPhotoDescribed)) + downloaded = sum(1 for m in item.media if isinstance(m, MediaPhotoDownloaded)) + assert described == 2 + assert downloaded == 3 + assert len(client.messages.calls) == 1 + + +def test_describe_all_items_filter_restricts_scope(tmp_path: Path): + """`--items ` must skip items not in the filter list.""" + media_root = tmp_path / "media" + item_a = _item("a", [_downloaded(item_id="a", index=0, media_root=media_root)]) + item_b = _item("b", [_downloaded(item_id="b", index=0, media_root=media_root)]) + store = {"a": item_a, "b": item_b} + client = _FakeVisionClient([[_judgment(0)]]) + describe_all( + store, + media_root, + model="m", + output_language="English", + description_version="v1", + items_filter=["b"], + client=client, + ) + assert isinstance(item_a.media[0], MediaPhotoDownloaded) + assert isinstance(item_b.media[0], MediaPhotoDescribed) + + +def test_describe_all_handles_missing_file_as_per_batch_failure(tmp_path: Path): + """A missing photo file is per-photo failure, not a total abort. + + The entry says "downloaded" but the bytes are not on disk + (operator removed `data/media/`, snapshot restored old items.json, + etc). The orchestrator catches the OSError, marks the whole batch + failed, and continues with the next batch. + """ + media_root = tmp_path / "media" + # Entry A: bytes on disk. Entry B: bytes missing on purpose. + entry_a = _downloaded(item_id="a", index=0, media_root=media_root) + entry_b_missing = MediaPhotoDownloaded( + url="https://pbs.twimg.com/media/b.jpg", + local_path="b/0.jpg", + width=4, + height=3, + bytes_size=512, + downloaded_at=datetime(2026, 5, 24, tzinfo=timezone.utc), + ) + item_a = _item("a", [entry_a]) + item_b = _item("b", [entry_b_missing]) + store = {"a": item_a, "b": item_b} + client = _FakeVisionClient([[_judgment(0)], [_judgment(0)]]) + report = describe_all( + store, + media_root, + model="m", + output_language="English", + description_version="v1", + batch_size=1, + client=client, + ) + assert isinstance(item_a.media[0], MediaPhotoDescribed) + assert isinstance(item_b.media[0], MediaPhotoDownloaded) # stayed Downloaded + assert report.photos_described == 1 + assert report.photos_failed == 1 + + +def test_describe_all_emits_summary_line_on_partial_failure(tmp_path: Path, capsys): + """Partial-failure runs emit a SUMMARY line; same convention as `media`.""" + from anthropic import APIError + + media_root = tmp_path / "media" + item = _item( + "1", + [_downloaded(item_id="1", index=i, media_root=media_root) for i in range(2)], + ) + store = {"1": item} + client = _FakeVisionClient( + [ + [_judgment(0)], + APIError("503", request=None, body=None), + ] + ) + describe_all( + store, + media_root, + model="m", + output_language="English", + description_version="v1", + batch_size=1, + client=client, + ) + err = capsys.readouterr().err + assert "SUMMARY: described: 1, failed: 1" in err + + +def test_describe_all_emits_no_summary_when_all_succeed(tmp_path: Path, capsys): + """A clean batch stays silent on stderr — same convention as `media`.""" + media_root = tmp_path / "media" + entry = _downloaded(item_id="1", index=0, media_root=media_root) + item = _item("1", [entry]) + store = {"1": item} + client = _FakeVisionClient([[_judgment(0)]]) + describe_all( + store, + media_root, + model="m", + output_language="English", + description_version="v1", + client=client, + ) + err = capsys.readouterr().err + assert "SUMMARY:" not in err + + +def test_describe_all_fires_on_progress_after_each_batch(tmp_path: Path): + """The Ctrl-C-coherent invariant: persistence fires between batches.""" + media_root = tmp_path / "media" + item = _item( + "1", + [_downloaded(item_id="1", index=i, media_root=media_root) for i in range(3)], + ) + store = {"1": item} + client = _FakeVisionClient( + [ + [_judgment(0)], + [_judgment(0)], + [_judgment(0)], + ] + ) + calls: list[int] = [] + describe_all( + store, + media_root, + model="m", + output_language="English", + description_version="v1", + batch_size=1, + client=client, + on_progress=lambda: calls.append(len(calls) + 1), + ) + assert calls == [1, 2, 3] + + +def test_describe_all_skips_pending_failed_and_video_silently(tmp_path: Path): + """Non-downloaded variants are silently out of scope — no API call, no failure.""" + media_root = tmp_path / "media" + pending = MediaPhotoPending(url="https://pbs.twimg.com/media/p.jpg") + failed = MediaPhotoFailed( + url="https://pbs.twimg.com/media/f.jpg", + failure_reason="http_4xx", + error="HTTP 404", + attempts=1, + last_attempt_at=datetime(2026, 5, 24, tzinfo=timezone.utc), + ) + video = MediaVideoPending(url="https://video.twimg.com/x.mp4") + item = _item("1", [pending, failed, video]) + store = {"1": item} + client = _FakeVisionClient([]) # no calls expected + report = describe_all( + store, + media_root, + model="m", + output_language="English", + description_version="v1", + client=client, + ) + assert report.photos_attempted == 0 + assert report.photos_described == 0 + assert report.batches_attempted == 0 + assert client.messages.calls == [] + + +def test_describe_all_total_failure_does_not_emit_summary(tmp_path: Path, capsys): + """The all-failed branch raises BEFORE the summary print — verify.""" + from anthropic import APIError + + media_root = tmp_path / "media" + entry = _downloaded(item_id="1", index=0, media_root=media_root) + item = _item("1", [entry]) + store = {"1": item} + client = _FakeVisionClient([APIError("503", request=None, body=None)]) + with pytest.raises(RuntimeError): + describe_all( + store, + media_root, + model="m", + output_language="English", + description_version="v1", + client=client, + ) + err = capsys.readouterr().err + assert "SUMMARY:" not in err + + +def test_describe_all_carries_over_local_path_dimensions(tmp_path: Path): + """The described variant inherits the on-disk fields verbatim — no re-decode.""" + media_root = tmp_path / "media" + entry = _downloaded(item_id="1", index=0, media_root=media_root) + # Tweak the fields so we can prove they round-tripped intact. + entry = MediaPhotoDownloaded( + url=entry.url, + local_path=entry.local_path, + width=1920, + height=1080, + bytes_size=123456, + downloaded_at=entry.downloaded_at, + ) + item = _item("1", [entry]) + store = {"1": item} + client = _FakeVisionClient([[_judgment(0)]]) + describe_all( + store, + media_root, + model="m", + output_language="English", + description_version="v1", + client=client, + ) + described = item.media[0] + assert isinstance(described, MediaPhotoDescribed) + assert described.local_path == "1/0.jpg" + assert described.width == 1920 + assert described.height == 1080 + assert described.bytes_size == 123456 + + +def test_describe_all_uses_injectable_clock(tmp_path: Path): + """`now` injection lets tests assert the timestamp deterministically.""" + media_root = tmp_path / "media" + entry = _downloaded(item_id="1", index=0, media_root=media_root) + item = _item("1", [entry]) + store = {"1": item} + client = _FakeVisionClient([[_judgment(0)]]) + fixed = datetime(2030, 1, 1, 12, tzinfo=timezone.utc) + describe_all( + store, + media_root, + model="m", + output_language="English", + description_version="v1", + client=client, + now=lambda: fixed, + ) + described = item.media[0] + assert isinstance(described, MediaPhotoDescribed) + assert described.described_at == fixed + + +# --------------------------------------------------------------------- summary line + + +def test_emit_summary_line_silent_when_nothing_happened(capsys): + """No attempts, no skips → no SUMMARY noise. Same convention as media.""" + emit_summary_line(DescribeReport()) + assert "SUMMARY:" not in capsys.readouterr().err + + +def test_emit_summary_line_prints_when_described_or_skipped(capsys): + """A non-zero `photos_described` triggers the SUMMARY line.""" + report = DescribeReport( + photos_attempted=1, + photos_described=1, + ) + emit_summary_line(report) + err = capsys.readouterr().err + assert "SUMMARY: described: 1, failed: 0, skipped: 0" in err + + +def test_emit_summary_line_prints_when_only_skipped(capsys): + """An all-skipped no-op (idempotency proof) emits the SUMMARY line too.""" + report = DescribeReport(photos_skipped_already_described=3) + emit_summary_line(report) + err = capsys.readouterr().err + assert "skipped: 3" in err + + +# --------------------------------------------------------------------- response helper + + +def test_fake_response_block_carries_text_attribute(): + """`tests.conftest.FakeBlock` requires both `type` and `text` attributes.""" + block = FakeBlock(payload={"x": 1}) + assert block.type == "text" + assert isinstance(block.text, str) + + +def test_fake_response_round_trips(): + """Smoke check on the `FakeResponse` machinery the tests rely on.""" + payload = {"summary": "x", "primary_topic": "x", "topics": ["x"]} + response = FakeResponse(payload) + assert response.content[0].text + + +def test_fake_anthropic_records_calls(): + """`FakeAnthropic.messages.calls` records every kwarg dict — relied on by other tests.""" + client = FakeAnthropic([{"x": 1}]) + client.messages.create(model="m", system="s", messages=[]) + assert client.messages.calls[0]["model"] == "m" From 3c302f8646a6f52183ccbc2f1934030adec7c7ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Victor=20Gonz=C3=A1lez-Pacheco?= Date: Sun, 24 May 2026 19:31:35 +0200 Subject: [PATCH 04/11] [#34] add xbrain describe CLI command + snapshot trigger The CLI surface mirrors xbrain media: --force, --limit, --items, --model, --batch-size, --verbose. The output_language and the description_version both come from config (config.toml + new [describe] section); --model overrides config.describe_model per invocation. The pre-describe snapshot is auto-taken so a runaway prompt or wrong model can be rolled back with xbrain snapshot restore. Persistence is double-anchored: on_progress writes the store between batches (Ctrl-C-coherent invariant) and a finally clause catches any total-failure RuntimeError so the per-photo MediaPhotoDescribed records that landed before the raise are not discarded. config.toml.example documents the new [describe] section with defaults (Sonnet 4.6 + version v1). 4 CLI tests cover the happy path, no-op exit-0, --items mismatch warning, and total-failure exit-1 propagation. Co-Authored-By: Claude Opus 4.7 (1M context) --- config.toml.example | 11 ++++ src/xbrain/cli.py | 128 ++++++++++++++++++++++++++++++++++++++ src/xbrain/config.py | 14 +++++ tests/test_cli.py | 145 +++++++++++++++++++++++++++++++++++++++++++ tests/test_config.py | 27 ++++++++ 5 files changed, 325 insertions(+) diff --git a/config.toml.example b/config.toml.example index adcf202..cb5bfc7 100644 --- a/config.toml.example +++ b/config.toml.example @@ -39,3 +39,14 @@ language = "English" # "wikilink" - **Topics:** [[ai-coding]] · [[software-engineering]] (default) # "hashtag" - **Topics:** #ai-coding #software-engineering topic_style = "wikilink" + +[describe] +# Vision model used by `xbrain describe`. Sonnet 4.6 is the default — +# the quality / cost sweet spot (~$3-5 for a 2k-image corpus). Override +# per run with `--model` while iterating; the CLI flag wins. +model = "claude-sonnet-4-6" +# Description-version tag persisted on every described photo. Bumping +# this value invalidates existing descriptions: the next `xbrain +# describe` run re-describes stale entries automatically. Use it when +# you change the describe-image rubric or expectations. +version = "v1" diff --git a/src/xbrain/cli.py b/src/xbrain/cli.py index 7227722..3d9e3be 100644 --- a/src/xbrain/cli.py +++ b/src/xbrain/cli.py @@ -14,6 +14,8 @@ from xbrain import snapshot from xbrain.archive import parse_archive from xbrain.config import Config, load_config +from xbrain.describe import describe_all as run_describe_all +from xbrain.describe import emit_summary_line as describe_emit_summary_line from xbrain.diff import diff_snapshots, format_json, format_text from xbrain.enrich import apply_worksheet_judgments, enrich_with_executor, items_pending_enrichment from xbrain.executors.api import ApiExecutor @@ -359,6 +361,132 @@ def media( _run_media(cfg, force=force, limit=limit, items_filter=items_filter, verbose=verbose) +def _run_describe( + cfg: Config, + *, + force: bool, + limit: int | None, + items_filter: list[str] | None, + model: str, + batch_size: int, + verbose: bool, +) -> None: + """Run the vision-describe orchestrator and persist after every batch. + + Always snapshots `data/` first (the same recovery boundary as + `xbrain media`): a botched run — a wrong model, a runaway prompt + — can be undone with `xbrain snapshot restore`. Persistence + happens twice for the same reason as `_run_media`: once after + every batch (the `on_progress` callback fires between batches so + Ctrl-C mid-run leaves `items.json` coherent), and once + unconditionally at the end so the elapsed-described timestamps + are captured even if the orchestrator raised on total failure. + """ + if items_filter: + target = set(items_filter) + store_ids = set(load_store(cfg.items_path)) + missing = target - store_ids + if missing and not (target & store_ids): + typer.echo( + f"AVISO: --items {','.join(items_filter)} no coincide con ningún item " + f"del store ({len(store_ids)} items). El run será un no-op.", + err=True, + ) + _auto_snapshot(cfg, "describe") + store = load_store(cfg.items_path) + + def _persist() -> None: + save_store(store, cfg.items_path) + + try: + report = run_describe_all( + store, + cfg.media_dir, + model=model, + output_language=cfg.output_language, + description_version=cfg.describe_version, + force=force, + limit=limit, + items_filter=items_filter, + batch_size=batch_size, + on_progress=_persist, + ) + finally: + # Persist whatever transitioned, even if `describe_all` raised. A + # RuntimeError on total failure must not discard the per-photo + # MediaPhotoDescribed records that landed before the raise. + save_store(store, cfg.items_path) + describe_emit_summary_line(report) + typer.echo( + f"Describe: descritas {report.photos_described}, " + f"fallidas {report.photos_failed}, " + f"saltadas {report.photos_skipped_already_described}" + ) + if verbose and report.per_item_failures: + typer.echo("Failed photos:", err=True) + for item_id, failures in sorted(report.per_item_failures.items()): + for url, error in failures: + typer.echo(f" {item_id} {url} {error}", err=True) + + +@app.command() +@_handle_cli_errors +def describe( + force: bool = typer.Option( + False, + "--force", + help="Re-describir todas las fotos, incluso las ya descritas en la versión actual.", + ), + limit: int | None = typer.Option( + None, + "--limit", + help="Máximo número de fotos a describir en esta ejecución.", + ), + items: str | None = typer.Option( + None, + "--items", + help="IDs de items separados por comas para limitar el alcance del run.", + ), + model: str | None = typer.Option( + None, + "--model", + help="Modelo de visión a usar. Si no se pasa, se usa el del config (`describe.model`).", + ), + batch_size: int = typer.Option( + 5, + "--batch-size", + min=1, + help="Número de imágenes por llamada a la API. 5 es el sweet spot (12-15%% ahorro de tokens).", + ), + verbose: bool = typer.Option( + False, + "--verbose", + help="Imprime cada foto fallida (item_id, URL, error) al final del run.", + ), +) -> None: + """Describe las fotos descargadas con un LLM de visión. + + Solo describe fotos con bytes en disco (`MediaPhotoDownloaded`). + Las entradas ya descritas en la versión actual se saltan; bumpear + `[describe].version` en `config.toml` fuerza un re-describe + automático sin `--force`. Las descripciones se persisten en + `items.json` y son consumidas por `xbrain enrich` y `xbrain topics` + en las llamadas LLM subsiguientes. + """ + cfg = _config() + items_filter = [s.strip() for s in items.split(",") if s.strip()] if items else None + chosen_model = model or cfg.describe_model + _run_describe( + cfg, + force=force, + limit=limit, + items_filter=items_filter, + model=chosen_model, + batch_size=batch_size, + verbose=verbose, + ) + + @app.command() @_handle_cli_errors def enrich( diff --git a/src/xbrain/config.py b/src/xbrain/config.py index 06568da..69c2f6a 100644 --- a/src/xbrain/config.py +++ b/src/xbrain/config.py @@ -29,6 +29,17 @@ class Config: topics_resynth_threshold: int output_language: str # one of xbrain.i18n.SUPPORTED_LANGUAGES topic_style: str # one of xbrain.config.SUPPORTED_TOPIC_STYLES + # `describe_model` defaults to Sonnet 4.6 — the spec settled on it as the + # quality / cost sweet spot for vision (~$3-5 for a 2k-image corpus). + # Override per run via `xbrain describe --model ...` when iterating on + # prompt or budget; the CLI flag wins over the config value. + describe_model: str + # `describe_version` tags every produced description so a prompt + # evolution can be rolled out incrementally: bumping the value here + # makes the next `xbrain describe` run re-describe stale entries + # automatically (no `--force` needed). The string is exact-match — + # there is no ordering relation, only equality. + describe_version: str @property def items_path(self) -> Path: @@ -95,6 +106,7 @@ def load_config(repo_root: Path) -> Config: f"config.toml: [output].topic_style must be one of " f"{list(SUPPORTED_TOPIC_STYLES)}, got {topic_style!r}" ) + describe = settings.get("describe", {}) return Config( repo_root=repo_root, vault=vault, @@ -107,4 +119,6 @@ def load_config(repo_root: Path) -> Config: topics_resynth_threshold=resynth_threshold, output_language=output_language, topic_style=topic_style, + describe_model=describe.get("model", "claude-sonnet-4-6"), + describe_version=describe.get("version", "v1"), ) diff --git a/tests/test_cli.py b/tests/test_cli.py index c4f3cc2..d36d04b 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -834,3 +834,148 @@ def test_vocab_rejects_unknown_executor(tmp_path, monkeypatch): result = runner.invoke(app, ["vocab", "--executor", "bogus"]) assert result.exit_code == 1 assert "bogus" in result.output + + +# ----------------------------------------------------------------------- describe + + +def _setup_describe_repo(tmp_path: Path, monkeypatch) -> Path: + """Like `_setup_repo` but with a pre-populated photo on disk + Downloaded variant.""" + import io as _io + from datetime import datetime, timezone + + from PIL import Image + + from xbrain.models import Author, Item, MediaPhotoDownloaded + + _setup_repo(tmp_path, monkeypatch) + media_root = tmp_path / "data" / "media" + (media_root / "42").mkdir(parents=True, exist_ok=True) + buffer = _io.BytesIO() + Image.new("RGB", (8, 6), color=(10, 20, 30)).save(buffer, format="JPEG") + (media_root / "42" / "0.jpg").write_bytes(buffer.getvalue()) + + item = Item( + id="42", + source="bookmark", + url="https://x.com/a/status/42", + author=Author(handle="a", name="A"), + text="text", + created_at=datetime(2026, 5, 10, tzinfo=timezone.utc), + captured_at=datetime(2026, 5, 24, tzinfo=timezone.utc), + media=[ + MediaPhotoDownloaded( + url="https://pbs.twimg.com/media/42-0.jpg", + local_path="42/0.jpg", + width=8, + height=6, + bytes_size=200, + downloaded_at=datetime(2026, 5, 24, tzinfo=timezone.utc), + ) + ], + ) + save_store({"42": item}, tmp_path / "data" / "items.json") + return tmp_path + + +def test_describe_command_transitions_downloaded_to_described(tmp_path: Path, monkeypatch): + """End-to-end: a Downloaded photo becomes Described via the CLI command.""" + import json as _json + + from xbrain.models import MediaPhotoDescribed + from xbrain.store import load_store + + _setup_describe_repo(tmp_path, monkeypatch) + + class _FakeBlock: + type = "text" + + def __init__(self, text: str): + self.text = text + + class _FakeResp: + def __init__(self, text: str): + self.content = [_FakeBlock(text)] + + class _FakeMessages: + def __init__(self): + self.calls: list[dict] = [] + + def create(self, **kwargs): + self.calls.append(kwargs) + payload = _json.dumps([{"index": 0, "is_decorative": False, "description": "A chart."}]) + return _FakeResp(payload) + + class _FakeClient: + def __init__(self): + self.messages = _FakeMessages() + + import xbrain.cli as cli + + fake_client = _FakeClient() + + def _patched_run(store, media_root, **kwargs): + kwargs["client"] = fake_client + return _orig(store, media_root, **kwargs) + + _orig = cli.run_describe_all + monkeypatch.setattr(cli, "run_describe_all", _patched_run) + + result = runner.invoke(app, ["describe"]) + assert result.exit_code == 0, result.output + reloaded = load_store(tmp_path / "data" / "items.json") + entry = reloaded["42"].media[0] + assert isinstance(entry, MediaPhotoDescribed) + assert entry.description == "A chart." + assert entry.description_lang == "English" + assert entry.description_version == "v1" + + +def test_describe_command_runs_on_empty_store(tmp_path: Path, monkeypatch): + """No items → describe is a no-op exit-0, with a snapshot still taken.""" + _setup_repo(tmp_path, monkeypatch) + result = runner.invoke(app, ["describe"]) + assert result.exit_code == 0 + # A snapshot was created — recovery boundary mirrors media. + snapshots = list((tmp_path / "data" / "snapshots").glob("*-describe")) + assert snapshots, "describe must auto-snapshot data/ before running" + + +def test_describe_command_warns_when_items_filter_matches_nothing(tmp_path, monkeypatch): + """`--items` with no matches surfaces an AVISO line — same pattern as media.""" + _setup_describe_repo(tmp_path, monkeypatch) + result = runner.invoke(app, ["describe", "--items", "no-such-id"]) + assert result.exit_code == 0 + assert "AVISO" in result.output or "AVISO" in (result.stderr or "") + + +def test_describe_command_propagates_total_failure_as_exit_1(tmp_path, monkeypatch): + """A total-failure RuntimeError surfaces as exit code 1 via `_handle_cli_errors`.""" + from anthropic import APIError + + _setup_describe_repo(tmp_path, monkeypatch) + + class _AlwaysFailingMessages: + def __init__(self): + self.calls: list[dict] = [] + + def create(self, **kwargs): + self.calls.append(kwargs) + raise APIError("401 unauthorized", request=None, body=None) + + class _FakeClient: + def __init__(self): + self.messages = _AlwaysFailingMessages() + + import xbrain.cli as cli + + def _patched(store, media_root, **kwargs): + kwargs["client"] = _FakeClient() + return _orig(store, media_root, **kwargs) + + _orig = cli.run_describe_all + monkeypatch.setattr(cli, "run_describe_all", _patched) + + result = runner.invoke(app, ["describe"]) + assert result.exit_code == 1 + assert "Error" in result.output diff --git a/tests/test_config.py b/tests/test_config.py index 5a2ddbe..752dea5 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -195,3 +195,30 @@ def test_load_config_rejects_unknown_topic_style(tmp_path: Path): ) with pytest.raises(ValueError, match="topic_style"): load_config(tmp_path) + + +def test_load_config_describe_settings_have_defaults(tmp_path: Path): + """No [describe] section → Sonnet 4.6 + version v1 (the spec defaults).""" + _write_repo(tmp_path) + cfg = load_config(tmp_path) + assert cfg.describe_model == "claude-sonnet-4-6" + assert cfg.describe_version == "v1" + + +def test_load_config_round_trips_describe_overrides(tmp_path: Path): + """[describe] section overrides — operators can pin a different model + version.""" + (tmp_path / "config.toml").write_text( + "[paths]\n" + 'vault = "/tmp/vault"\n' + 'output_subdir = "learnings/x-knowledge"\n' + 'data_dir = "data"\n' + "[x]\n" + 'handle = "vgonpa"\n' + "[describe]\n" + 'model = "claude-opus-4-1"\n' + 'version = "v3"\n', + encoding="utf-8", + ) + cfg = load_config(tmp_path) + assert cfg.describe_model == "claude-opus-4-1" + assert cfg.describe_version == "v3" From 49287ac4ddc084297b451592948fe5aef8101187 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Victor=20Gonz=C3=A1lez-Pacheco?= Date: Sun, 24 May 2026 19:35:12 +0200 Subject: [PATCH 05/11] [#34] integrate image descriptions into enrich + topics prompts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The enrich-time prompt in executors/api.py now carries an "Images in this post:" section listing every non-decorative MediaPhotoDescribed entry's prose, sitting between the post body and the links/article block so the LLM reads context in natural order. Decorative photos are filtered at the seam — they introduce no topic noise. TopicInput grows an image_descriptions field; build_topic_inputs flattens content-bearing descriptions across every post in the topic and feeds them to the topic-synth prompt below the per-post summaries. Same decorative-filter as enrich. The prompt shape stays back-compat: items / topics without described photos emit no images section at all. 8 new tests guard the integration: inclusion, decorative-filter, omission-when-empty, ordering vs links/article. Both _user_prompt helpers were split into per-section builders to keep complexity under grade C — no new C functions introduced. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/xbrain/executors/api.py | 84 +++++++++++++++++++++++++++++-------- src/xbrain/topic_synth.py | 24 ++++++++++- src/xbrain/topics.py | 43 +++++++++++++++---- tests/test_executors_api.py | 62 +++++++++++++++++++++++++++ tests/test_topic_synth.py | 25 +++++++++++ tests/test_topics.py | 54 ++++++++++++++++++++++++ 6 files changed, 265 insertions(+), 27 deletions(-) diff --git a/src/xbrain/executors/api.py b/src/xbrain/executors/api.py index 2b18138..4f9c26e 100644 --- a/src/xbrain/executors/api.py +++ b/src/xbrain/executors/api.py @@ -13,7 +13,7 @@ from xbrain.executors.base import EnrichmentJudgment from xbrain.llm_json import json_from_response -from xbrain.models import ContentSourceSuccess, Item, Topic +from xbrain.models import ContentSourceSuccess, Item, MediaPhotoDescribed, Topic from xbrain.rubrics import ARTICLE_CHAR_LIMIT, load_rubric _MAX_TOKENS = 600 @@ -60,6 +60,69 @@ def _system_prompt(language: str) -> str: ) +def _content_image_descriptions(item: Item) -> list[str]: + """Return non-decorative image descriptions on the item, in media order. + + Decorative photos (`is_decorative=True`) are filtered out at this + seam so they introduce no topic noise — an avatar or a reaction + meme would otherwise drag the assigned topics toward whatever the + image happened to depict. Items without described photos return an + empty list. + """ + return [ + entry.description + for entry in item.media + if isinstance(entry, MediaPhotoDescribed) + and not entry.is_decorative + and entry.description + ] + + +def _images_section(item: Item) -> list[str]: + """Build the `Images in this post:` block, or an empty list when not applicable. + + Visual content carries topic signal too. The describe stage + already filtered decoratives — this just splices the prose in + right before the article body so the LLM reads the post + the + image evidence + the article in natural order. + """ + image_descriptions = _content_image_descriptions(item) + if not image_descriptions: + return [] + lines = ["", "Images in this post:"] + lines += [f"- {description}" for description in image_descriptions] + return lines + + +def _links_section(item: Item) -> list[str]: + """Build the `Links in the post:` block, or an empty list when not applicable.""" + if not item.links: + return [] + lines = [ + "", + "Links in the post (the domain is topic signal even when " + "the article body is unavailable):", + ] + lines += [f"- {ln.url} (domain: {ln.domain})" for ln in item.links] + return lines + + +def _article_sections(item: Item) -> list[str]: + """Build one block per successfully-fetched article. Empty if no content.""" + if item.content is None or not item.content.sources: + return [] + lines: list[str] = [] + for src in item.content.sources: + # Narrow to the success variant — only those carry `title`/`text`. + if isinstance(src, ContentSourceSuccess) and src.text: + lines += [ + "", + f"Linked article ({src.title or src.url}):", + src.text[:ARTICLE_CHAR_LIMIT], + ] + return lines + + def _user_prompt(item: Item, vocab: list[Topic]) -> str: parts = [ "Controlled vocabulary (use only these slugs):", @@ -70,22 +133,9 @@ def _user_prompt(item: Item, vocab: list[Topic]) -> str: ] if item.bookmark_folder: parts += ["", f"Saved by the user in the bookmark folder: {item.bookmark_folder}"] - if item.links: - parts += [ - "", - "Links in the post (the domain is topic signal even when " - "the article body is unavailable):", - ] - parts += [f"- {ln.url} (domain: {ln.domain})" for ln in item.links] - if item.content and item.content.sources: - # Narrow to the success variant — only those carry `title`/`text`. - for src in item.content.sources: - if isinstance(src, ContentSourceSuccess) and src.text: - parts += [ - "", - f"Linked article ({src.title or src.url}):", - src.text[:ARTICLE_CHAR_LIMIT], - ] + parts += _images_section(item) + parts += _links_section(item) + parts += _article_sections(item) return "\n".join(parts) diff --git a/src/xbrain/topic_synth.py b/src/xbrain/topic_synth.py index 94a7b41..be5243a 100644 --- a/src/xbrain/topic_synth.py +++ b/src/xbrain/topic_synth.py @@ -9,7 +9,7 @@ import json import sys -from dataclasses import dataclass +from dataclasses import dataclass, field from datetime import datetime, timezone from pathlib import Path @@ -37,11 +37,21 @@ class OverviewJudgment(BaseModel): @dataclass class TopicInput: - """Everything an executor needs to synthesize one topic's overview.""" + """Everything an executor needs to synthesize one topic's overview. + + `summaries` is the per-post text already enrichment produced. + `image_descriptions` is the vision-LLM-produced prose for every + non-decorative photo in the topic's posts — fed to the + topic-synthesis LLM alongside the summaries so the topic page + reflects visual evidence that the per-post enrichment may have + only partially captured. Decorative photos are excluded at the + seam (mirrors `xbrain.executors.api._content_image_descriptions`). + """ slug: str description: str summaries: list[str] + image_descriptions: list[str] = field(default_factory=list) def _system_prompt(language: str) -> str: @@ -64,6 +74,16 @@ def _user_prompt(topic: TopicInput) -> str: f"Summaries of the {len(topic.summaries)} posts in this topic:", ] lines += [f"- {summary}" for summary in topic.summaries] + if topic.image_descriptions: + # Visual evidence supplements the per-post summaries: a chart + # or screenshot in a topic carries signal the summary may have + # only hinted at. Decorative photos are filtered out by the + # caller (`build_topic_inputs`) so they introduce no noise. + lines += [ + "", + f"Images across the {len(topic.image_descriptions)} content-bearing photos in this topic:", + ] + lines += [f"- {description}" for description in topic.image_descriptions] return "\n".join(lines) diff --git a/src/xbrain/topics.py b/src/xbrain/topics.py index cdfeb32..0efa49f 100644 --- a/src/xbrain/topics.py +++ b/src/xbrain/topics.py @@ -13,7 +13,7 @@ from xbrain import notes_io from xbrain.i18n import Strings, strings_for -from xbrain.models import Item, Topic, TopicPage +from xbrain.models import Item, MediaPhotoDescribed, Topic, TopicPage from xbrain.topic_synth import OverviewJudgment, TopicInput @@ -172,10 +172,34 @@ def topics_needing_synth( return needing +def _collect_summaries(item_pool: list[Item]) -> list[str]: + """Flatten the non-empty `enriched.summary` strings across a pool of items.""" + return [ + item.enriched.summary for item in item_pool if item.enriched and item.enriched.summary + ] + + +def _collect_image_descriptions(item_pool: list[Item]) -> list[str]: + """Flatten content-bearing image descriptions across a pool of items. + + Decorative photos are filtered out at this seam so the topic prompt + never carries avatar / reaction-meme noise — same filter the enrich + prompt applies in `xbrain.executors.api`. + """ + return [ + entry.description + for item in item_pool + for entry in item.media + if isinstance(entry, MediaPhotoDescribed) + and not entry.is_decorative + and entry.description + ] + + def build_topic_inputs( slugs: list[str], vocab: list[Topic], all_posts: dict[str, TopicPosts] ) -> list[TopicInput]: - """Build the synthesis input for each slug — its description + post summaries.""" + """Build the synthesis input for each slug — description + summaries + image descriptions.""" by_slug = {topic.slug: topic for topic in vocab} inputs: list[TopicInput] = [] for slug in slugs: @@ -183,12 +207,15 @@ def build_topic_inputs( if topic is None: raise ValueError(f"slug '{slug}' is not in the vocabulary") posts = all_posts.get(slug, TopicPosts()) - summaries = [ - item.enriched.summary - for item in (posts.primary + posts.also) - if item.enriched and item.enriched.summary - ] - inputs.append(TopicInput(slug=slug, description=topic.description, summaries=summaries)) + item_pool = posts.primary + posts.also + inputs.append( + TopicInput( + slug=slug, + description=topic.description, + summaries=_collect_summaries(item_pool), + image_descriptions=_collect_image_descriptions(item_pool), + ) + ) return inputs diff --git a/tests/test_executors_api.py b/tests/test_executors_api.py index c022aad..ca3dd47 100644 --- a/tests/test_executors_api.py +++ b/tests/test_executors_api.py @@ -194,3 +194,65 @@ def test_api_executor_emits_no_summary_when_all_succeed(capsys): ex = ApiExecutor(model="m", output_language="English", client=client) ex.enrich_items([_item("1")], VOCAB) assert "SUMMARY:" not in capsys.readouterr().err + + +def _described_photo(*, description: str, decorative: bool = False): + """Build a `MediaPhotoDescribed` for prompt-integration tests.""" + from datetime import datetime, timezone + + from xbrain.models import MediaPhotoDescribed + + return MediaPhotoDescribed( + url="https://pbs.twimg.com/media/X.jpg", + local_path="1/0.jpg", + width=4, + height=3, + bytes_size=512, + downloaded_at=datetime(2026, 5, 24, tzinfo=timezone.utc), + is_decorative=decorative, + description=description, + description_lang="English", + description_version="v1", + described_at=datetime(2026, 5, 24, 12, tzinfo=timezone.utc), + ) + + +def test_user_prompt_includes_content_bearing_image_descriptions(): + """Non-decorative described photos must surface as `Images in this post:` lines.""" + item = _item("1", media=[_described_photo(description="A chart of MMLU scores.")]) + prompt = _user_prompt(item, VOCAB) + assert "Images in this post:" in prompt + assert "A chart of MMLU scores." in prompt + + +def test_user_prompt_excludes_decorative_image_descriptions(): + """Decorative photos must NOT appear in the prompt — pure noise.""" + item = _item("1", media=[_described_photo(description="ignored", decorative=True)]) + prompt = _user_prompt(item, VOCAB) + assert "Images in this post:" not in prompt + + +def test_user_prompt_omits_images_section_when_no_described_photos(): + """Items without any described photos must NOT have the images section. + + Otherwise the LLM sees a hint of missing context and may hallucinate + visual evidence that does not exist. Regression guard. + """ + item = _item("1") # no media + prompt = _user_prompt(item, VOCAB) + assert "Images in this post:" not in prompt + + +def test_user_prompt_image_descriptions_precede_links_and_article(): + """Image section sits between the post body and the links/article.""" + from xbrain.models import Link + + item = _item( + "1", + media=[_described_photo(description="A diagram of GraphQL caching.")], + links=[Link(url="https://example.com/x", domain="example.com")], + ) + prompt = _user_prompt(item, VOCAB) + image_idx = prompt.index("Images in this post:") + links_idx = prompt.index("Links in the post") + assert image_idx < links_idx diff --git a/tests/test_topic_synth.py b/tests/test_topic_synth.py index 67d4fdd..fb2c770 100644 --- a/tests/test_topic_synth.py +++ b/tests/test_topic_synth.py @@ -229,3 +229,28 @@ def test_synthesize_overviews_api_emits_no_summary_when_all_succeed(capsys): client=client, ) assert "SUMMARY:" not in capsys.readouterr().err + + +def test_user_prompt_includes_image_descriptions_when_provided(): + """When `image_descriptions` is non-empty, the prompt carries an `Images` section.""" + from xbrain.topic_synth import _user_prompt + + topic_input = TopicInput( + slug="ai-coding", + description="LLMs writing software", + summaries=["s1"], + image_descriptions=["A flowchart of a feedback loop.", "Code in a terminal."], + ) + prompt = _user_prompt(topic_input) + assert "Images across the 2 content-bearing photos" in prompt + assert "A flowchart of a feedback loop." in prompt + assert "Code in a terminal." in prompt + + +def test_user_prompt_omits_image_section_when_empty(): + """Default empty `image_descriptions` produces no image section — regression guard.""" + from xbrain.topic_synth import _user_prompt + + topic_input = TopicInput(slug="x", description="d", summaries=["s"]) + prompt = _user_prompt(topic_input) + assert "Images across" not in prompt diff --git a/tests/test_topics.py b/tests/test_topics.py index 3e59dc4..75db3a3 100644 --- a/tests/test_topics.py +++ b/tests/test_topics.py @@ -197,6 +197,60 @@ def test_build_topic_inputs_rejects_an_unknown_slug(): build_topic_inputs(["not-a-real-topic"], _VOCAB, {}) +def _described_photo_in_topic(*, description: str, decorative: bool = False): + """Build a `MediaPhotoDescribed` for the topic-inputs filtering test.""" + from datetime import datetime, timezone + + from xbrain.models import MediaPhotoDescribed + + return MediaPhotoDescribed( + url="https://pbs.twimg.com/media/X.jpg", + local_path="1/0.jpg", + width=4, + height=3, + bytes_size=512, + downloaded_at=datetime(2026, 5, 24, tzinfo=timezone.utc), + is_decorative=decorative, + description=description, + description_lang="English", + description_version="v1", + described_at=datetime(2026, 5, 24, 12, tzinfo=timezone.utc), + ) + + +def test_build_topic_inputs_collects_content_image_descriptions(): + """Content-bearing photos contribute their descriptions; decoratives are filtered.""" + from xbrain.topics import build_topic_inputs + + primary_item = _enriched("1", "ai-coding", ["ai-coding"]) + primary_item.media = [ + _described_photo_in_topic(description="A chart of model accuracy."), + _described_photo_in_topic(description="ignored", decorative=True), + ] + also_item = _enriched("2", "career", ["career", "ai-coding"]) + also_item.media = [ + _described_photo_in_topic(description="A screenshot of a CI dashboard."), + ] + posts = {"ai-coding": TopicPosts()} + posts["ai-coding"].primary.append(primary_item) + posts["ai-coding"].also.append(also_item) + inputs = build_topic_inputs(["ai-coding"], _VOCAB, posts) + assert sorted(inputs[0].image_descriptions) == [ + "A chart of model accuracy.", + "A screenshot of a CI dashboard.", + ] + + +def test_build_topic_inputs_empty_when_no_described_photos(): + """An item without any described media contributes an empty image list.""" + from xbrain.topics import build_topic_inputs + + posts = {"ai-coding": TopicPosts()} + posts["ai-coding"].primary.append(_enriched("1", "ai-coding", ["ai-coding"])) + inputs = build_topic_inputs(["ai-coding"], _VOCAB, posts) + assert inputs[0].image_descriptions == [] + + def test_compute_topic_posts_dedups_duplicate_slugs_in_topics(): # A store record with a duplicate slug in `enriched.topics` (pre-validation # data) must place the item once, not twice, in the also-relevant list. From 110b3659e871c980a723ab34e88cdea1a42c1bad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Victor=20Gonz=C3=A1lez-Pacheco?= Date: Sun, 24 May 2026 19:36:18 +0200 Subject: [PATCH 06/11] [#34] extend xbrain diff with described counts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The diff module already grew the described column + delta_described field in the model commit. This commit adds the two regression tests that prove the diff surfaces a downloaded → described transition between snapshots and that the text renderer emits the new "described:" row alongside the existing four. Co-Authored-By: Claude Opus 4.7 (1M context) --- tests/test_diff.py | 66 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) diff --git a/tests/test_diff.py b/tests/test_diff.py index 6b28838..c708b7d 100644 --- a/tests/test_diff.py +++ b/tests/test_diff.py @@ -581,6 +581,72 @@ def test_diff_media_counts_video_pending_separately(tmp_path: Path) -> None: assert report.media.delta_video_pending == 1 +def test_diff_media_reports_delta_described(tmp_path: Path) -> None: + """`xbrain describe` advances downloaded → described; the diff surfaces +N described. + + Setup: snapshot A has 1 downloaded photo. Snapshot B has the same + URL transitioned to described (same on-disk bytes, plus the new + description payload). The transition shows up as + `delta_downloaded=-1, delta_described=+1`. + """ + from xbrain.models import MediaPhotoDescribed, MediaPhotoDownloaded + + a = tmp_path / "a" + b = tmp_path / "b" + item_a = _item_with_media( + "1", + [ + MediaPhotoDownloaded( + url="https://pbs.twimg.com/media/A.png", + local_path="1/0.png", + width=10, + height=8, + bytes_size=100, + downloaded_at=datetime(2026, 5, 24, tzinfo=timezone.utc), + ) + ], + ) + item_b = _item_with_media( + "1", + [ + MediaPhotoDescribed( + url="https://pbs.twimg.com/media/A.png", + local_path="1/0.png", + width=10, + height=8, + bytes_size=100, + downloaded_at=datetime(2026, 5, 24, tzinfo=timezone.utc), + is_decorative=False, + description="A chart of accuracy by model.", + description_lang="English", + description_version="v1", + described_at=datetime(2026, 5, 24, 12, tzinfo=timezone.utc), + ) + ], + ) + _seed(a, items={"1": item_a}, vocab_slugs=[]) + _seed(b, items={"1": item_b}, vocab_slugs=[]) + + report = diff_snapshots(a, b) + assert report.media.a.downloaded == 1 + assert report.media.a.described == 0 + assert report.media.b.downloaded == 0 + assert report.media.b.described == 1 + assert report.media.delta_downloaded == -1 + assert report.media.delta_described == 1 + assert report.summary.media_delta_described == 1 + + +def test_diff_media_text_format_includes_described_row(tmp_path: Path) -> None: + """The text renderer emits a `described:` row alongside the others.""" + a = tmp_path / "a" + b = tmp_path / "b" + _seed(a, items={"1": _item_with_media("1", [])}, vocab_slugs=[]) + _seed(b, items={"1": _item_with_media("1", [])}, vocab_slugs=[]) + text = format_text(diff_snapshots(a, b)) + assert "described:" in text + + def test_diff_media_text_format_includes_block(tmp_path: Path) -> None: """The text renderer includes a `MEDIA` block with the four counters.""" from xbrain.models import MediaPhotoDownloaded From d71a30a55a92d2d9f7337c8c28a3a38810f4c2ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Victor=20Gonz=C3=A1lez-Pacheco?= Date: Sun, 24 May 2026 19:38:42 +0200 Subject: [PATCH 07/11] [#34] update README + ARCHITECTURE for the describe stage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit README grows a Commands row for describe, a Configuration row for [describe].model and [describe].version, and a Vision descriptions subsection in the Local media storage section that explains the $3-5 budget, the decorative-filter, and the description-version re-describe trigger. ARCHITECTURE adds a Per-stage describe section between fetch and vocab covering the state machine, batching, refusal handling, failure isolation, snapshot trigger, and the consumption seam in executors/api.py + topic_synth.py. The rubrics table grows a describe-image row; invariant #10 names the fifth Media variant and makes the linear-pipeline shape explicit (Pending → Downloaded → Described, with Failed as the off-ramp from Pending). Also bundles two formatter-only cleanups picked up by ruff format on the executors/api.py and topics.py changes from the prompt integration commit. Co-Authored-By: Claude Opus 4.7 (1M context) --- ARCHITECTURE.md | 34 ++++++++++++++++++++++++++++++++-- README.md | 29 ++++++++++++++++++++++++++++- src/xbrain/executors/api.py | 7 ++----- src/xbrain/topics.py | 8 ++------ 4 files changed, 64 insertions(+), 14 deletions(-) diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md index b4fa467..4bdbc15 100644 --- a/ARCHITECTURE.md +++ b/ARCHITECTURE.md @@ -266,6 +266,35 @@ The numbered stages above are summarised; the sections below cover each one in d **Snapshot trigger.** `xbrain media` always snapshots `data/` first (label `pre-media`), mirroring the destructive-op recovery boundary. The snapshot covers `items.json` / `state.json` / `vocab.yaml` / `topics.json` only — the binary photo bytes under `data/media/` are NOT included; re-downloading via `xbrain media` is the recovery path. +### describe + +**What it does.** Sends every downloaded photo to a Claude vision model, asks for a 1-3 sentence prose description plus a `is_decorative` classification, and persists the prose on the entry. The entry transitions from `MediaPhotoDownloaded` to `MediaPhotoDescribed` (a fifth variant on the `MediaEntry` union, on top of the four `media` already exposes). Decorative photos (avatars, reaction memes, abstract backgrounds) are classified as such with an empty description so downstream prompts can filter them out without re-classifying. + +**Reads.** `data/items.json` + `data/media//.` (the bytes the downloader wrote). + +**Writes.** `data/items.json` — each described photo entry carries `is_decorative` + `description` + `description_lang` + `description_version` + `described_at`. No new on-disk binary state; the bytes from the prior `MediaPhotoDownloaded` are inherited verbatim. + +**State machine.** Each `xbrain describe` run advances eligible photo entries: +- `Downloaded` → `Described` (description on the entry, bytes unchanged). +- `Described` (stale version) → `Described` (current version), automatically. +- `Described` (current version) → no-op (skipped) unless `--force`. + +Eligibility ignores `Pending` / `Failed` / `VideoPending`: describe only runs on photos with bytes on disk. The description-version tag is the rubric-evolution lever: bumping `[describe].version` in `config.toml` invalidates persisted entries so the next run re-describes them without `--force`. + +**Batching.** Default batch size is 5 images per API call (the spec's quality / cost sweet spot — ~12-15 % token saving vs per-image, modest added complexity). Override with `--batch-size N`. + +**Refusals.** Vision refusals (faces, NSFW) are NOT a hard failure: the entry is persisted as decorative with an empty description, and the run continues. The same `is_decorative` flag downstream consumers already use for "no topic signal" handles the refusal uniformly. + +**Failure isolation.** Per-batch error isolation: one failing API call does not abort the run. A total-failure run (every batch errored) raises `RuntimeError` so the CLI surfaces non-zero exit. The orchestrator's `on_progress` callback writes `items.json` between batches so Ctrl-C mid-run leaves the store coherent — same recovery contract as `media`. + +**Snapshot trigger.** `xbrain describe` always snapshots `data/` first (label `pre-describe`), mirroring `media`'s recovery boundary. A botched run — wrong model, runaway prompt — can be undone with `xbrain snapshot restore`. + +**Feeds the LLM stages.** Once described, the prose is consumed automatically: +- `xbrain enrich` (in `executors/api.py:_user_prompt`) splices an `Images in this post:` section between the post body and the links/article block when the item has content-bearing described photos. Decoratives are filtered. +- `xbrain topics` (in `topic_synth.py:_user_prompt`) appends the flat list of content-bearing image descriptions across every post in a topic, after the per-post summaries. + +This is how a tweet that is mostly a screenshot of a paper becomes searchable by what the screenshot was actually about. + ### vocab **What it does.** Induces a closed taxonomy of ~30-45 topics from the whole corpus. Map step: chunks the corpus, asks an LLM to propose candidate topics per chunk. Reduce step: asks the LLM to consolidate the union of candidates down to `vocab.target_count` topics. Always includes a `misc` topic for posts with no thematic core. @@ -333,7 +362,7 @@ Everything XBrain knows lives in four files inside `data/` (gitignored). They ar | File | Format | What it is | Mutated by | |------|--------|------------|------------| -| `items.json` | JSON array of `Item` | The source of truth — every post XBrain has ever seen, with all fetched content and enrichment | `extract`, `fetch`, `enrich`, `media` | +| `items.json` | JSON array of `Item` | The source of truth — every post XBrain has ever seen, with all fetched content, enrichment, and per-photo vision descriptions | `extract`, `fetch`, `enrich`, `media`, `describe` | | `state.json` | JSON | Extractor cursors (`last_seen_id`, `last_run`) per source, archive-import marker | `extract`, `import-archive` | | `vocab.yaml` | YAML list of `Topic` | The controlled topic taxonomy — closed list of slugs + descriptions | `vocab` | | `topics.json` | JSON dict of `TopicPage` | The synthesized topic-page overviews and notes, keyed by slug | `topics` | @@ -355,6 +384,7 @@ The LLM-driven stages (`vocab`, `enrich`, `topics`) do not have their instructio | `rubric-topics.md` | `enrich` | Assign one `primary_topic` + 0-3 secondaries from the closed vocab. Never invent slugs | | `rubric-summary.md` | `enrich` | Write a 1-3 sentence summary, faithful to the post and the fetched article, no hallucination | | `rubric-topic-page.md` | `topics` | Synthesize 1-3 paragraphs of plain prose + up to 15 short notes per topic, zero wikilinks | +| `rubric-describe-image.md` | `describe` | Classify each photo as decorative vs content-bearing and describe content-bearing ones in 1-3 sentences. Refusals fall through as decorative with empty description | **Why a separate file per rubric.** Changing how XBrain summarizes posts is editing one markdown file, not chasing a string through the codebase. The rubric is the *contract* between code and LLM; the code only handles structure, transport and validation. @@ -438,7 +468,7 @@ These are the rules the rest of the architecture rests on. Breaking any of them 7. **Operation names, not query ids.** The extractor anchors to X GraphQL operation names because X rotates the ids. Anything that hardcodes an id will break. 8. **Destructive ops are reversible.** Every command that overwrites a `data/` artifact (`vocab --regenerate`, `topics --resynth`, `fetch --force`) snapshots `data/` first to `data/snapshots/-pre-/`. `xbrain snapshot restore ` is the recovery path. A snapshot failure aborts the destructive op. 9. **Fetch records are tagged unions.** A `ContentSource` on `items.json` is either a `Success` (with required `text`) or a `Failure` (with required `failure_reason`). Mixed shapes are not representable — pydantic rejects them at construction, and mypy rejects them statically (via the `pydantic.mypy` plugin). Legacy records with `ok: bool` (pre-#20) are normalised on read by a `BeforeValidator` on the union, so existing `data/items.json` files keep working without a manual migration. The static contract is pinned by `tests/type_probes/illegal_states.py`. -10. **Media variants are mutually exclusive states.** A `MediaEntry` on `items.json` is one of `MediaPhotoPending` / `MediaPhotoDownloaded` / `MediaPhotoFailed` / `MediaVideoPending`, discriminated by `kind`. State transitions happen only via `xbrain media`. Legacy records with the flat `{type, url}` shape are normalised on read by a `BeforeValidator` on the union — no manual migration needed. (See the `### media` section above for the retry contract and storage layout.) +10. **Media variants are mutually exclusive states.** A `MediaEntry` on `items.json` is one of `MediaPhotoPending` / `MediaPhotoDownloaded` / `MediaPhotoFailed` / `MediaPhotoDescribed` / `MediaVideoPending`, discriminated by `kind`. The photo states form a linear pipeline: `Pending → Downloaded → Described` (with `Failed` as the off-ramp from `Pending`). State transitions happen only via `xbrain media` (advances `Pending` and retries `Failed`) and `xbrain describe` (advances `Downloaded` to `Described`). Legacy records with the flat `{type, url}` shape are normalised on read by a `BeforeValidator` on the union — no manual migration needed. (See the `### media` and `### describe` sections above for the per-stage contracts.) --- diff --git a/README.md b/README.md index 817b922..aedaf0d 100644 --- a/README.md +++ b/README.md @@ -397,6 +397,8 @@ topic_style = "wikilink" # wikilink | hashtag (in-body Topics: | `[topics]` | `resynth_threshold` | `25` | Post growth that marks a topic overview stale. | | `[output]` | `language` | `English` | Output language for LLM summaries/overviews AND wiki section headers. `English` or `Spanish`. | | `[output]` | `topic_style` | `wikilink` | How the in-body `**Topics:**` line is rendered: `wikilink` (`[[slug]] · [[slug]]`) or `hashtag` (`#slug #slug`). Frontmatter `tags:` are unaffected. | +| `[describe]` | `model` | `claude-sonnet-4-6` | Vision model for `xbrain describe`. Override per run with `--model`. | +| `[describe]` | `version` | `v1` | Tag persisted on every described photo. Bumping invalidates existing descriptions so the next `xbrain describe` re-describes stale entries. | Switching `[output].language` after the corpus is already enriched is supported — but does not retroactively translate existing summaries. To convert the @@ -517,6 +519,7 @@ uv run xbrain [options] | `import-archive ` | Backfill the full own-tweet history from the official X data archive. | | `fetch` | Download linked article content, expand threads, fetch linked X content. By default, items whose only previous failures were transient (`timeout`, `dns_error`) are re-fetched automatically; terminal failures (`not_found`, `paywall`, `forbidden`, `js_required`, `empty_content`) stay skipped until `--force`. `--force` re-fetches every external_article source regardless of state. | | `media` | Download X-post photos referenced in `Item.media` and render them inline in the wiki. `--force`, `--limit N`, `--items `, `--verbose`. See [Local media storage](#local-media-storage). | +| `describe` | Describe downloaded photos with a vision LLM (Claude Sonnet 4.6 by default) and feed the prose into `enrich` + `topics`. `--force`, `--limit N`, `--items `, `--model`, `--batch-size`, `--verbose`. Idempotent — re-runs skip already-described photos unless `[describe].version` is bumped in `config.toml`. | | `vocab` | Induce the topic taxonomy. `--executor`, `--apply `, `--regenerate`. | | `enrich` | Enrich items with a summary + topics. `--executor`, `--apply `. | | `topics` | Synthesise topic pages. `--executor`, `--apply `, `--resynth`. | @@ -583,7 +586,31 @@ Failures are categorised on the item itself the next `xbrain media` run. Run `xbrain diff ` after a media run to see how many photos -moved from `pending` / `failed` into `downloaded`. +moved from `pending` / `failed` into `downloaded` (or, after `xbrain +describe`, into `described`). + +**Vision descriptions** + +Once `xbrain media` has the bytes on disk, `xbrain describe` runs every +photo through Claude vision and stores a short prose description on +the entry (transitioning `MediaPhotoDownloaded` → `MediaPhotoDescribed`). +Descriptions are 1-3 sentences, faithful, in the configured +`output_language`. Decorative photos (avatars, reaction memes, +abstract backgrounds) are classified as such and persisted with an +empty description so they introduce no topic noise downstream. + +`xbrain enrich` and `xbrain topics` consume the descriptions +automatically: an item with content-bearing photos gets an +`Images in this post:` block in the enrichment prompt; topic-page +synthesis sees the flat list of content image descriptions across the +topic's posts. This is how a tweet that is mostly a screenshot of a +paper becomes searchable by what the screenshot was actually about. + +Describing the full corpus costs about $3-5 with the default model +(Sonnet 4.6, 5 images per call). Bump `[describe].version` in +`config.toml` to invalidate stored descriptions when you change the +rubric — the next `xbrain describe` run re-describes stale entries +automatically without `--force`. --- diff --git a/src/xbrain/executors/api.py b/src/xbrain/executors/api.py index 4f9c26e..1172aab 100644 --- a/src/xbrain/executors/api.py +++ b/src/xbrain/executors/api.py @@ -72,9 +72,7 @@ def _content_image_descriptions(item: Item) -> list[str]: return [ entry.description for entry in item.media - if isinstance(entry, MediaPhotoDescribed) - and not entry.is_decorative - and entry.description + if isinstance(entry, MediaPhotoDescribed) and not entry.is_decorative and entry.description ] @@ -100,8 +98,7 @@ def _links_section(item: Item) -> list[str]: return [] lines = [ "", - "Links in the post (the domain is topic signal even when " - "the article body is unavailable):", + "Links in the post (the domain is topic signal even when the article body is unavailable):", ] lines += [f"- {ln.url} (domain: {ln.domain})" for ln in item.links] return lines diff --git a/src/xbrain/topics.py b/src/xbrain/topics.py index 0efa49f..aea284b 100644 --- a/src/xbrain/topics.py +++ b/src/xbrain/topics.py @@ -174,9 +174,7 @@ def topics_needing_synth( def _collect_summaries(item_pool: list[Item]) -> list[str]: """Flatten the non-empty `enriched.summary` strings across a pool of items.""" - return [ - item.enriched.summary for item in item_pool if item.enriched and item.enriched.summary - ] + return [item.enriched.summary for item in item_pool if item.enriched and item.enriched.summary] def _collect_image_descriptions(item_pool: list[Item]) -> list[str]: @@ -190,9 +188,7 @@ def _collect_image_descriptions(item_pool: list[Item]) -> list[str]: entry.description for item in item_pool for entry in item.media - if isinstance(entry, MediaPhotoDescribed) - and not entry.is_decorative - and entry.description + if isinstance(entry, MediaPhotoDescribed) and not entry.is_decorative and entry.description ] From a215fb68921d785d82c05c03bfb8eacd70f6a5bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Victor=20Gonz=C3=A1lez-Pacheco?= Date: Sun, 24 May 2026 20:22:28 +0200 Subject: [PATCH 08/11] [#34] dedupe SUMMARY + harden MediaPhotoDescribed invariants MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Drop `_print_partial_failure_summary` from `describe_all`: the CLI's `describe_emit_summary_line` is the single source of truth (mirrors `media`). Eliminates the double `SUMMARY:` emission on partial-failure runs (python-code-reviewer BLOCKER). - `MediaPhotoDescribed` now carries: - `description_lang: SupportedLanguage` (typed alias over `i18n.SUPPORTED_LANGUAGES`) — rejects unknown languages at the type boundary (type-design-analyzer). - Model validator enforcing `is_decorative => description == ""` so a hand-constructed variant cannot violate the invariant (type-design-analyzer). - `described_at` UTC-aware enforcement via field_validator. - Dedupe the `local_path` traversal guard via a shared free function (`_reject_local_path_traversal`); same for the UTC-aware check (`_require_utc_aware`). Closes the validator duplication called out by python-code-reviewer + code-simplifier — done via free functions rather than class inheritance because Liskov substitution would silently re-match `MediaPhotoDescribed` against 25+ `isinstance(entry, MediaPhotoDownloaded)` call sites. - Add the same UTC-aware contract to `MediaPhotoDownloaded.downloaded_at` and `MediaPhotoFailed.last_attempt_at` (closes the pre-existing gap for the variants this PR touches). - Tests: orchestrator no longer emits SUMMARY on partial failure (assert `err.count("SUMMARY:") == 0` — pinning the dedup). Test helpers for decorative described-photo construction honour the new empty-description invariant. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/xbrain/describe.py | 31 ++---- src/xbrain/models.py | 182 ++++++++++++++++++++++++++---------- tests/test_describe.py | 16 +++- tests/test_executors_api.py | 10 +- tests/test_topics.py | 10 +- 5 files changed, 169 insertions(+), 80 deletions(-) diff --git a/src/xbrain/describe.py b/src/xbrain/describe.py index 61f514e..80a2cb8 100644 --- a/src/xbrain/describe.py +++ b/src/xbrain/describe.py @@ -10,15 +10,16 @@ The structure mirrors `xbrain.executors.api` and `xbrain.topic_synth`: a recoverable-errors tuple, per-batch failure isolation, -`logger.warning` on every failure, `RuntimeError` on total failure, and -a `SUMMARY: described: N, failed: M, skipped: K` stderr line for the -CLI. Programmer bugs (`AttributeError`) and `KeyboardInterrupt` -propagate — narrow `except` clauses only. +`logger.warning` on every failure, and `RuntimeError` on total failure. +Programmer bugs (`AttributeError`) and `KeyboardInterrupt` propagate +— narrow `except` clauses only. The `SUMMARY: described: N, failed: M, +skipped: K` stderr line is emitted exclusively by the CLI via +`emit_summary_line` (single source of truth, same shape as `media`). I/O dependencies (the Anthropic client, the media root path) are keyword-injectable so tests run offline. The store mutation is in -place; the caller is expected to wrap each transition with a -store-write (the `on_progress` callback fires after every batch). +place; the caller wraps each transition with a store-write +(the `on_progress` callback fires after every batch). """ from __future__ import annotations @@ -588,7 +589,6 @@ def describe_all( report.elapsed_seconds = time.monotonic() - started _raise_on_total_failure(report) - _print_partial_failure_summary(report) return report @@ -653,23 +653,6 @@ def _raise_on_total_failure(report: DescribeReport) -> None: ) -def _print_partial_failure_summary(report: DescribeReport) -> None: - """Print the SUMMARY line on partial failure — mirrors `executors.api`. - - Stays silent on a clean run (no failures = no noise) and on a - total-failure run (the raise above is the signal). The line shape - matches `xbrain.media.emit_summary_line` so log parsers can rely - on a single `SUMMARY:` prefix across all stages. - """ - if report.photos_failed > 0: - print( - f"SUMMARY: described: {report.photos_described}, " - f"failed: {report.photos_failed}, " - f"skipped: {report.photos_skipped_already_described}", - file=sys.stderr, - ) - - def _build_user_blocks(batch: list[_Candidate], media_root: Path) -> list[dict]: """Build the user-turn content blocks for one batch: images + directive. diff --git a/src/xbrain/models.py b/src/xbrain/models.py index b018b1e..39f1cad 100644 --- a/src/xbrain/models.py +++ b/src/xbrain/models.py @@ -6,7 +6,21 @@ from datetime import datetime from typing import Annotated, Any, Literal, Union -from pydantic import BaseModel, BeforeValidator, Field, TypeAdapter, field_validator +from pydantic import ( + BaseModel, + BeforeValidator, + Field, + TypeAdapter, + field_validator, + model_validator, +) + +from xbrain.i18n import SUPPORTED_LANGUAGES + +# Type alias mirroring `i18n.SUPPORTED_LANGUAGES` so the data layer rejects +# unknown languages at construction time. `Literal[*tuple]` unpacking is +# Python 3.11+ — we are on 3.12 — and keeps the source of truth in `i18n`. +SupportedLanguage = Literal[*SUPPORTED_LANGUAGES] # type: ignore[valid-type] logger = logging.getLogger(__name__) @@ -93,6 +107,47 @@ class MediaPhotoPending(_MediaPhotoBase): kind: Literal["photo_pending"] = "photo_pending" +def _reject_local_path_traversal(value: str) -> str: + """Reject absolute paths and `..` components on a `local_path`. + + Shared by `MediaPhotoDownloaded` and `MediaPhotoDescribed` so the + on-disk path contract has one source of truth. `local_path` is + joined onto `data/media/` at render and download time. A persisted + record like ``"/etc/passwd"`` or ``"../../x"`` would let a poisoned + `items.json` exfiltrate bytes outside the media root. The downloader + never builds such a path, but the persisted store is on-disk plain + JSON the user can edit — defence in depth at the type boundary is + cheap. + + The two variants share the validator via this free function rather + than via class inheritance: the rest of the codebase relies on + `isinstance(entry, MediaPhotoDownloaded)` meaning "exactly the + Downloaded state" in many call sites (the photo states form a + tagged union, not a Liskov hierarchy). + """ + if value.startswith("/") or value.startswith("\\"): + raise ValueError(f"local_path must be relative, got {value!r}") + # Normalise separators before scanning components so a Windows-style + # path persisted on a foreign machine is still caught. + for part in value.replace("\\", "/").split("/"): + if part == "..": + raise ValueError(f"local_path must not contain '..' components: {value!r}") + return value + + +def _require_utc_aware(field_name: str, value: datetime) -> datetime: + """Reject naive timestamps — every persisted instant must be UTC-aware. + + Shared by every photo-variant timestamp (`downloaded_at`, + `described_at`, `last_attempt_at`) so a hand-edited naive record on + a foreign machine does not silently miscompare against + `now(timezone.utc)`. We do not coerce — that would mask the bug. + """ + if value.tzinfo is None: + raise ValueError(f"{field_name} must be timezone-aware, got naive {value!r}") + return value + + class MediaPhotoDownloaded(_MediaPhotoBase): """A photo successfully downloaded; the bytes exist at `local_path`. @@ -106,6 +161,9 @@ class MediaPhotoDownloaded(_MediaPhotoBase): "downloaded" photo is semantically illegal — Pillow validation in `xbrain.media._decode_image` rules out the dim=0 case at the seam, and the type constraint pins it at the data layer too. + + `downloaded_at` MUST be timezone-aware (UTC) so a hand-edited naive + timestamp does not silently miscompare against `now(timezone.utc)`. """ kind: Literal["photo_downloaded"] = "photo_downloaded" @@ -117,25 +175,15 @@ class MediaPhotoDownloaded(_MediaPhotoBase): @field_validator("local_path") @classmethod - def _reject_path_traversal(cls, value: str) -> str: - """Reject absolute paths and `..` components. - - `local_path` is joined onto `data/media/` at render and download - time. A persisted record like ``"/etc/passwd"`` or ``"../../x"`` - would let a poisoned `items.json` exfiltrate bytes outside the - media root. The downloader code never builds such a path, but - the persisted store is on-disk plain JSON the user can edit — - defence in depth at the type boundary is cheap. - """ + def _validate_local_path(cls, value: str) -> str: _ = cls # required by @field_validator+@classmethod; placate vulture - if value.startswith("/") or value.startswith("\\"): - raise ValueError(f"local_path must be relative, got {value!r}") - # Normalise separators before scanning components so a Windows-style - # path persisted on a foreign machine is still caught. - for part in value.replace("\\", "/").split("/"): - if part == "..": - raise ValueError(f"local_path must not contain '..' components: {value!r}") - return value + return _reject_local_path_traversal(value) + + @field_validator("downloaded_at") + @classmethod + def _validate_downloaded_at(cls, value: datetime) -> datetime: + _ = cls + return _require_utc_aware("downloaded_at", value) class MediaPhotoFailed(_MediaPhotoBase): @@ -147,6 +195,9 @@ class MediaPhotoFailed(_MediaPhotoBase): each transient retry bumps it. `attempts` is `ge=1`: a "failed but never attempted" record is semantically nonsense, and the downloader increments before producing any `MediaPhotoFailed`. + + `last_attempt_at` MUST be timezone-aware (UTC) — same contract as + every other persisted instant on the photo variants. """ kind: Literal["photo_failed"] = "photo_failed" @@ -155,29 +206,49 @@ class MediaPhotoFailed(_MediaPhotoBase): attempts: int = Field(ge=1) last_attempt_at: datetime + @field_validator("last_attempt_at") + @classmethod + def _validate_last_attempt_at(cls, value: datetime) -> datetime: + _ = cls + return _require_utc_aware("last_attempt_at", value) + class MediaPhotoDescribed(_MediaPhotoBase): """A downloaded photo that has also been described by a vision LLM. The terminal state for a content-bearing photo: the bytes still live - at `local_path` (every `MediaPhotoDownloaded` invariant carries over) - AND a vision pass has classified the image and written a short prose - description. Decorative photos (avatars, reaction memes, abstract - backgrounds) reach this variant too — they carry `is_decorative=True` - and an empty `description`, so downstream callers can filter them out + at `local_path` (every `MediaPhotoDownloaded` invariant carries over + verbatim — `local_path` + dimensions + `bytes_size` + `downloaded_at` + are re-declared so the field shape is wire-identical) AND a vision + pass has classified the image and written a short prose description. + Decorative photos (avatars, reaction memes, abstract backgrounds) + reach this variant too — they carry `is_decorative=True` and an + empty `description`, so downstream callers can filter them out without re-classifying. + Why NOT inherit from `MediaPhotoDownloaded`? The photo states form a + pydantic tagged union, not a Liskov class hierarchy. 25+ call sites + use `isinstance(entry, MediaPhotoDownloaded)` to mean "exactly the + Downloaded state" (eligibility checks, diff counters, generator + routing). Inheritance would silently re-match every Described entry + against those checks. We dedupe the field-traversal validator and + the UTC-aware validator via free functions instead. + `description_version` is the rubric/version tag the description was produced under. Bumping the configured version invalidates existing entries: the next `xbrain describe` run treats them as stale and - re-describes (no `--force` needed). `description_lang` records the - language the prose was written in so a vault that switches - `output_language` mid-corpus can be re-described selectively. - - `description` is empty for decorative photos by contract — the - vision rubric returns an empty string in that case and a refusal - (faces, NSFW) is also bucketed as decorative with empty description. - No `gt=0` constraint here for that reason. + re-describes (no `--force` needed). `description_lang` is typed as + `SupportedLanguage` so an unknown language is rejected at + construction — the type alias is derived from `i18n.SUPPORTED_LANGUAGES` + (single source of truth). + + `description` is empty for decorative photos by contract — enforced + by a model-level validator below so a hand-constructed variant + cannot violate the invariant. The vision rubric returns an empty + string in the decorative case and a refusal (faces, NSFW) is also + bucketed as decorative with empty description. + + `described_at` and `downloaded_at` MUST be timezone-aware (UTC). """ kind: Literal["photo_described"] = "photo_described" @@ -188,28 +259,43 @@ class MediaPhotoDescribed(_MediaPhotoBase): downloaded_at: datetime is_decorative: bool description: str - description_lang: str + description_lang: SupportedLanguage description_version: str described_at: datetime @field_validator("local_path") @classmethod - def _reject_path_traversal(cls, value: str) -> str: - """Reject absolute paths and `..` components. - - Mirrors `MediaPhotoDownloaded._reject_path_traversal` — the - described variant inherits the same on-disk path contract because - the bytes from the prior Downloaded state are still referenced - verbatim. Persisted `items.json` is on-disk plain JSON the user - can edit, so the defence in depth is reapplied here. - """ + def _validate_local_path(cls, value: str) -> str: _ = cls - if value.startswith("/") or value.startswith("\\"): - raise ValueError(f"local_path must be relative, got {value!r}") - for part in value.replace("\\", "/").split("/"): - if part == "..": - raise ValueError(f"local_path must not contain '..' components: {value!r}") - return value + return _reject_local_path_traversal(value) + + @field_validator("downloaded_at") + @classmethod + def _validate_downloaded_at(cls, value: datetime) -> datetime: + _ = cls + return _require_utc_aware("downloaded_at", value) + + @field_validator("described_at") + @classmethod + def _validate_described_at(cls, value: datetime) -> datetime: + _ = cls + return _require_utc_aware("described_at", value) + + @model_validator(mode="after") + def _decorative_implies_empty_description(self) -> "MediaPhotoDescribed": + """A decorative photo MUST have an empty description (rubric contract). + + Enforced at the type boundary so downstream callers can rely on + `is_decorative => not description` without re-validating. The + producer (`describe._apply_judgment`) already overwrites the + description to "" on decorative judgments; this is the + defence-in-depth for hand-edited records. + """ + if self.is_decorative and self.description != "": + raise ValueError( + f"is_decorative=True requires an empty description, got {self.description!r}" + ) + return self class MediaVideoPending(BaseModel): diff --git a/tests/test_describe.py b/tests/test_describe.py index a3a8ff0..5072b95 100644 --- a/tests/test_describe.py +++ b/tests/test_describe.py @@ -516,8 +516,10 @@ def test_describe_all_isolates_a_batch_error(tmp_path: Path, capsys): assert report.photos_described == 1 assert report.photos_failed == 1 assert report.batches_failed == 1 + # The orchestrator does not emit SUMMARY — the CLI does. Verify the + # orchestrator stays silent so the CLI is the single source of truth. err = capsys.readouterr().err - assert "SUMMARY: described: 1, failed: 1" in err + assert "SUMMARY:" not in err def test_describe_all_raises_when_every_batch_fails(tmp_path: Path): @@ -719,8 +721,14 @@ def test_describe_all_handles_missing_file_as_per_batch_failure(tmp_path: Path): assert report.photos_failed == 1 -def test_describe_all_emits_summary_line_on_partial_failure(tmp_path: Path, capsys): - """Partial-failure runs emit a SUMMARY line; same convention as `media`.""" +def test_describe_all_does_not_emit_summary_on_partial_failure(tmp_path: Path, capsys): + """The orchestrator never emits SUMMARY — the CLI is the single source of truth. + + Mirrors `media.download_all`: SUMMARY lives on `emit_summary_line`, + not on the orchestrator. Asserts the count is exactly zero on a + partial-failure run so a future regression that re-introduces a + second emitter (double-SUMMARY) fails this test. + """ from anthropic import APIError media_root = tmp_path / "media" @@ -745,7 +753,7 @@ def test_describe_all_emits_summary_line_on_partial_failure(tmp_path: Path, caps client=client, ) err = capsys.readouterr().err - assert "SUMMARY: described: 1, failed: 1" in err + assert err.count("SUMMARY:") == 0 def test_describe_all_emits_no_summary_when_all_succeed(tmp_path: Path, capsys): diff --git a/tests/test_executors_api.py b/tests/test_executors_api.py index ca3dd47..7568e80 100644 --- a/tests/test_executors_api.py +++ b/tests/test_executors_api.py @@ -197,7 +197,13 @@ def test_api_executor_emits_no_summary_when_all_succeed(capsys): def _described_photo(*, description: str, decorative: bool = False): - """Build a `MediaPhotoDescribed` for prompt-integration tests.""" + """Build a `MediaPhotoDescribed` for prompt-integration tests. + + `MediaPhotoDescribed` enforces `is_decorative => description == ""` + at the model layer; this helper honours that contract by forcing an + empty description when `decorative=True`, so the caller's `description` + argument is silently dropped in the decorative branch. + """ from datetime import datetime, timezone from xbrain.models import MediaPhotoDescribed @@ -210,7 +216,7 @@ def _described_photo(*, description: str, decorative: bool = False): bytes_size=512, downloaded_at=datetime(2026, 5, 24, tzinfo=timezone.utc), is_decorative=decorative, - description=description, + description="" if decorative else description, description_lang="English", description_version="v1", described_at=datetime(2026, 5, 24, 12, tzinfo=timezone.utc), diff --git a/tests/test_topics.py b/tests/test_topics.py index 75db3a3..52405c1 100644 --- a/tests/test_topics.py +++ b/tests/test_topics.py @@ -198,7 +198,13 @@ def test_build_topic_inputs_rejects_an_unknown_slug(): def _described_photo_in_topic(*, description: str, decorative: bool = False): - """Build a `MediaPhotoDescribed` for the topic-inputs filtering test.""" + """Build a `MediaPhotoDescribed` for the topic-inputs filtering test. + + `MediaPhotoDescribed` enforces `is_decorative => description == ""` + at the model layer; this helper honours that contract by forcing an + empty description when `decorative=True`, so the caller's `description` + argument is silently dropped in the decorative branch. + """ from datetime import datetime, timezone from xbrain.models import MediaPhotoDescribed @@ -211,7 +217,7 @@ def _described_photo_in_topic(*, description: str, decorative: bool = False): bytes_size=512, downloaded_at=datetime(2026, 5, 24, tzinfo=timezone.utc), is_decorative=decorative, - description=description, + description="" if decorative else description, description_lang="English", description_version="v1", described_at=datetime(2026, 5, 24, 12, tzinfo=timezone.utc), From 260c883a98e9c56a3c9fdec3762bff78c673eb13 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Victor=20Gonz=C3=A1lez-Pacheco?= Date: Sun, 24 May 2026 20:32:31 +0200 Subject: [PATCH 09/11] [#34] refactor orchestrator + SDK refusal + language drift + Protocol MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Convergent fixes from the round-1 reviewer pipeline. Category 2.1 mixed-language drift (type-design-analyzer + silent-failure-hunter): - `_is_stale` now also returns True when `description_lang` drifted vs the configured output language. Switching `output_language` between Spanish and English no longer leaves stale-language prose persisted — the next run re-describes those entries. - `output_language` parameter is typed `SupportedLanguage`, matching the new `MediaPhotoDescribed.description_lang` field type. The orchestrator now propagates a typed language end-to-end. Category 3 over-decomposed orchestrator (code-simplifier): - Collapsed the four-helper candidate chain (`_tally_skipped` + `_iter_item_candidates` + `_take_with_limit` + outer `_iter_candidates` + inner `_all_eligible` closure) into a single `_iter_eligible_candidates` generator, mirroring `media._iter_eligible_attempts`. A small `_tally_idempotency_skip` helper keeps the loop body under radon grade C; `_filtered_items` keeps the outer scan flat. - Removed `_take_with_limit`, `_chunked`, `_messages_create`, and `_user_directive` as standalone helpers. `itertools.batched(...)` replaces `_chunked`; the SDK call is inlined; the directive is a small dict literal builder local to the block building. Category 1.3 dead defence-in-depth (python-code-reviewer): - Dropped the unreachable duplicate-judgment-index guard in `_apply_batch_judgments`. The parser already rejects duplicate indices, making the dict-construction guard dead code. Category 4 silent-failure regressions (silent-failure-hunter): - 4.1: SDK `stop_reason="refusal"` now transitions every photo in the batch to `MediaPhotoDescribed(is_decorative=True, description="")`. Satisfies the rubric contract at the SDK level so refused batches make progress instead of churning. - 4.2: `JSONDecodeError` now logs `response.stop_reason` so a max-tokens truncation is diagnosable from the warning line alone. - 4.3: Unknown extension in `_media_type` now emits `logger.warning` before falling back to `image/jpeg`. - 4.4: A batch where the model omitted some judgments now bumps `batches_failed += 1` — the batch did not return complete data, so the operator's "did this API call do its job" view is honest. Category 5.3 type design (type-design-analyzer): - Defined `MessagesClient` + `VisionClient` Protocols local to `describe.py`. Dropped the `# type: ignore[attr-defined]` on `client.messages.create(...)`. Category 6 tests (pr-test-analyzer): - 6.1: New full-payload kwarg shape assertion (model, max_tokens, system, role, image+text block ordering, base64 source layout). - 6.2: `test_describe_all_handles_refusal_as_decorative_empty` plus a dedicated `_FakeRefusalResponse` fake. - 6.3: `test_parse_batch_response_strips_fence_without_language_tag`. - 6.4: `--verbose` CLI test exercising the per-failure listing. - 6.5: New mixed-language eligibility test (`test_eligible_described_stale_language_is_eligible`) and CLI partial-failure SUMMARY count test pinning to exactly one. - Plus refactored `_FakeMessagesList` so pre-built response objects (refusal, truncation) bypass the JSON-list wrapper. Category 7 comments + docs: - 7.1: Fixed misleading `_MEDIA_TYPES` comment (was "three", actual cardinality is 4 keys mapping to 3 distinct media types). - 7.2: Rewrote `_run_describe` docstring — coherence is held by the outer `try/finally`, not by `on_progress`. - 7.3: Trimmed the `describe.py` module docstring to 2 paragraphs matching the project voice (was 4 paragraphs, 22 lines). - 7.4: Reworded ARCHITECTURE.md `### describe` ("a new variant on the `MediaEntry` union"). Documented the language-staleness contract in the state-machine doc. Co-Authored-By: Claude Opus 4.7 (1M context) --- ARCHITECTURE.md | 8 +- src/xbrain/cli.py | 12 +- src/xbrain/describe.py | 521 ++++++++++++++++++++++++----------------- tests/test_cli.py | 201 ++++++++++++++++ tests/test_describe.py | 257 +++++++++++++++++++- 5 files changed, 760 insertions(+), 239 deletions(-) diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md index 4bdbc15..1fb2340 100644 --- a/ARCHITECTURE.md +++ b/ARCHITECTURE.md @@ -268,7 +268,7 @@ The numbered stages above are summarised; the sections below cover each one in d ### describe -**What it does.** Sends every downloaded photo to a Claude vision model, asks for a 1-3 sentence prose description plus a `is_decorative` classification, and persists the prose on the entry. The entry transitions from `MediaPhotoDownloaded` to `MediaPhotoDescribed` (a fifth variant on the `MediaEntry` union, on top of the four `media` already exposes). Decorative photos (avatars, reaction memes, abstract backgrounds) are classified as such with an empty description so downstream prompts can filter them out without re-classifying. +**What it does.** Sends every downloaded photo to a Claude vision model, asks for a 1-3 sentence prose description plus a `is_decorative` classification, and persists the prose on the entry. The entry transitions from `MediaPhotoDownloaded` to `MediaPhotoDescribed` (a new variant on the `MediaEntry` union). Decorative photos (avatars, reaction memes, abstract backgrounds) are classified as such with an empty description so downstream prompts can filter them out without re-classifying. **Reads.** `data/items.json` + `data/media//.` (the bytes the downloader wrote). @@ -276,10 +276,10 @@ The numbered stages above are summarised; the sections below cover each one in d **State machine.** Each `xbrain describe` run advances eligible photo entries: - `Downloaded` → `Described` (description on the entry, bytes unchanged). -- `Described` (stale version) → `Described` (current version), automatically. -- `Described` (current version) → no-op (skipped) unless `--force`. +- `Described` (stale version OR stale language) → `Described` (current version + current language), automatically. +- `Described` (current version + current language) → no-op (skipped) unless `--force`. -Eligibility ignores `Pending` / `Failed` / `VideoPending`: describe only runs on photos with bytes on disk. The description-version tag is the rubric-evolution lever: bumping `[describe].version` in `config.toml` invalidates persisted entries so the next run re-describes them without `--force`. +Eligibility ignores `Pending` / `Failed` / `VideoPending`: describe only runs on photos with bytes on disk. The description-version tag is the rubric-evolution lever: bumping `[describe].version` in `config.toml` invalidates persisted entries so the next run re-describes them without `--force`. The `description_lang` check is the mixed-vault guard: switching `[paths].output_language` from Spanish → English (or back) marks every previously-described entry stale so the enrich prompt never splices the wrong-language prose into a new vault. **Batching.** Default batch size is 5 images per API call (the spec's quality / cost sweet spot — ~12-15 % token saving vs per-image, modest added complexity). Override with `--batch-size N`. diff --git a/src/xbrain/cli.py b/src/xbrain/cli.py index 3d9e3be..7a26062 100644 --- a/src/xbrain/cli.py +++ b/src/xbrain/cli.py @@ -375,12 +375,12 @@ def _run_describe( Always snapshots `data/` first (the same recovery boundary as `xbrain media`): a botched run — a wrong model, a runaway prompt - — can be undone with `xbrain snapshot restore`. Persistence - happens twice for the same reason as `_run_media`: once after - every batch (the `on_progress` callback fires between batches so - Ctrl-C mid-run leaves `items.json` coherent), and once - unconditionally at the end so the elapsed-described timestamps - are captured even if the orchestrator raised on total failure. + — can be undone with `xbrain snapshot restore`. Coherence on a + Ctrl-C mid-run is held by the outer `try/finally` below, which + saves the store unconditionally even when the orchestrator raises; + the `on_progress` callback is for incremental persistence between + batches on a clean run (so a long describe run never loses more + than one batch of work to a process death). """ if items_filter: target = set(items_filter) diff --git a/src/xbrain/describe.py b/src/xbrain/describe.py index 80a2cb8..fd7d675 100644 --- a/src/xbrain/describe.py +++ b/src/xbrain/describe.py @@ -3,29 +3,23 @@ The `describe_all` orchestrator walks every photo entry the downloader has produced, batches the bytes into vision-API calls (default: 5 images per call to Claude Sonnet), parses the per-image JSON judgments, and -transitions matched entries to `MediaPhotoDescribed`. The persisted -description is consumed by the enrich-time prompt in `executors.api` -and the topic-synth prompt in `topic_synth` — decorative photos are -filtered out at that consumption seam so they introduce no topic noise. - -The structure mirrors `xbrain.executors.api` and `xbrain.topic_synth`: -a recoverable-errors tuple, per-batch failure isolation, -`logger.warning` on every failure, and `RuntimeError` on total failure. -Programmer bugs (`AttributeError`) and `KeyboardInterrupt` propagate -— narrow `except` clauses only. The `SUMMARY: described: N, failed: M, +transitions matched entries to `MediaPhotoDescribed`. Decorative photos +(avatars, reaction memes) are filtered at the downstream consumption +seam so they introduce no topic noise. + +The structure mirrors `xbrain.executors.api` and `xbrain.media`: +recoverable-errors tuple, per-batch failure isolation, `logger.warning` +on every failure, `RuntimeError` on total failure. Programmer bugs and +`KeyboardInterrupt` propagate. The `SUMMARY: described: N, failed: M, skipped: K` stderr line is emitted exclusively by the CLI via -`emit_summary_line` (single source of truth, same shape as `media`). - -I/O dependencies (the Anthropic client, the media root path) are -keyword-injectable so tests run offline. The store mutation is in -place; the caller wraps each transition with a store-write -(the `on_progress` callback fires after every batch). +`emit_summary_line` (single source of truth). """ from __future__ import annotations import base64 import io +import itertools import json import logging import sys @@ -34,11 +28,13 @@ from dataclasses import dataclass, field from datetime import datetime, timezone from pathlib import Path +from typing import Protocol from xbrain.models import ( Item, MediaPhotoDescribed, MediaPhotoDownloaded, + SupportedLanguage, ) from xbrain.rubrics import load_rubric @@ -56,7 +52,10 @@ _MAX_TOKENS = 1200 # Map file extensions to the Anthropic vision media-type strings. The -# downloader writes one of these three (see `xbrain.media._FORMAT_EXTENSIONS`). +# downloader writes one of these four (`.jpg` is mapped twice — once for +# `.jpg`, once for `.jpeg` — so the cardinality of distinct media types +# is three: image/jpeg, image/png, image/webp). See +# `xbrain.media._FORMAT_EXTENSIONS` for the producer side. _MEDIA_TYPES: dict[str, str] = { ".jpg": "image/jpeg", ".jpeg": "image/jpeg", @@ -65,6 +64,32 @@ } +class MessagesClient(Protocol): + """Minimal structural type for the Anthropic SDK `client.messages` seam. + + Defined locally so the orchestrator does not need to type-ignore an + untyped `client.messages.create(...)` call. The real + `anthropic.Anthropic().messages` satisfies this Protocol; the test + fakes (`tests.conftest.FakeAnthropic`, `_FakeVisionClient`) do too. + """ + + def create(self, **kwargs: object) -> object: + """Send one Anthropic `messages.create` call; return the raw response.""" + ... + + +class VisionClient(Protocol): + """Minimal structural type for the Anthropic SDK client itself. + + Drops the `# type: ignore[attr-defined]` that would otherwise be + needed on `client.messages.create(...)`. The protocol is local to + this module because describe is the only consumer; `executors.api` + still uses a duck-typed client for now (out of scope for this PR). + """ + + messages: MessagesClient + + def _utcnow() -> datetime: """Default `now` clock — UTC-aware `datetime.now()`. @@ -83,9 +108,10 @@ class DescribeReport: no-op re-run with no version bump must report every previously described photo here. `batches_attempted` counts API calls actually issued; `batches_failed` counts the ones the recoverable-errors - tuple swallowed. A run with `photos_attempted > 0 and - photos_described == 0` raises before this report leaves the - orchestrator. + tuple swallowed PLUS batches where the SDK refused or the model + returned fewer judgments than the batch size. A run with + `photos_attempted > 0 and photos_described == 0` raises before + this report leaves the orchestrator. """ items_processed: int = 0 @@ -107,8 +133,8 @@ class _Candidate: Holds back-references to `Item` and the media-list `index` so the orchestrator can swap the transitioned variant back into place - without re-scanning the store. `bytes_data` is loaded lazily by - `_load_bytes` — failing to read the file is a per-photo failure, + without re-scanning the store. Bytes are loaded lazily by + `_load_bytes` — failing to read the file is a per-batch failure, not a total-run abort. """ @@ -142,32 +168,44 @@ def _recoverable_errors() -> tuple[type[Exception], ...]: return (ValueError, json.JSONDecodeError, KeyError, OSError) -def _is_stale(entry: MediaPhotoDescribed, *, current_version: str) -> bool: - """A described entry is stale when its version no longer matches the config. +def _is_stale( + entry: MediaPhotoDescribed, + *, + current_version: str, + current_language: str, +) -> bool: + """A described entry is stale when its version OR language drifted. + + Two triggers, both no-`--force`: + + 1. `description_version != current_version` — bumping + `[describe].version` in `config.toml` invalidates the corpus + against a new rubric. + 2. `description_lang != current_language` — switching + `[paths].output_language` from Spanish to English (or back) + leaves stale-language prose in place that would otherwise + drift into the enrich prompt as a mixed-language vault. - Bumping `describe_version` in `config.toml` is the manual trigger to - re-describe the whole corpus against a new rubric — no `--force` - needed. The version check is exact-string: there is no ordering - relation between versions, only equality, so a deliberate downgrade - is also a "describe again" signal. + Equality is exact-string for both: there is no ordering relation + between versions or languages, so a deliberate downgrade also + triggers re-describe. """ - return entry.description_version != current_version + if entry.description_version != current_version: + return True + return entry.description_lang != current_language -def _eligible( - entry: object, - *, - force: bool, - current_version: str, +def _is_eligible( + entry: object, *, force: bool, current_version: str, current_language: str ) -> bool: """Decide whether `describe_all` should attempt this entry on THIS run. `MediaPhotoDownloaded` entries are always eligible — they have not been described yet by definition. `MediaPhotoDescribed` entries are - eligible only when `--force` is set or the persisted version is - stale vs the current `describe_version` config. Every other - variant (`MediaPhotoPending`, `MediaPhotoFailed`, `MediaVideoPending`) - is out of scope — describing only runs over photos whose bytes are + eligible only when `--force` is set or the persisted version/language + is stale vs the current config. Every other variant + (`MediaPhotoPending`, `MediaPhotoFailed`, `MediaVideoPending`) is + out of scope — describing only runs over photos whose bytes are already on disk. """ if isinstance(entry, MediaPhotoDownloaded): @@ -175,139 +213,130 @@ def _eligible( if isinstance(entry, MediaPhotoDescribed): if force: return True - return _is_stale(entry, current_version=current_version) + return _is_stale( + entry, + current_version=current_version, + current_language=current_language, + ) return False -def _tally_skipped( +def _tally_idempotency_skip( entry: object, *, current_version: str, + current_language: str, report: DescribeReport, ) -> None: - """Bump `photos_skipped_already_described` when this entry is a no-op skip. + """Bump `photos_skipped_already_described` for same-version+same-language describes. - Pulled out of the candidate iterator so the loop body in - `_iter_candidates` keeps a low complexity grade. Only described - entries on the current version count as skips — pending/failed/ - video entries are silently out of scope for `xbrain describe`. + Only fresh `MediaPhotoDescribed` entries count as idempotency skips. + Pending/failed/video entries are silently out of scope for + `xbrain describe`. Pulled out of the candidate iterator so the loop + body stays under radon grade C. """ if isinstance(entry, MediaPhotoDescribed) and not _is_stale( - entry, current_version=current_version + entry, + current_version=current_version, + current_language=current_language, ): report.photos_skipped_already_described += 1 -def _iter_item_candidates( - item_id: str, - item: Item, - *, - force: bool, - current_version: str, - report: DescribeReport, -) -> Iterator[_Candidate]: - """Yield every eligible candidate inside one item's media list. +def _filtered_items( + items: dict[str, Item], + items_filter: set[str] | None, +) -> Iterator[tuple[str, Item]]: + """Yield `(item_id, item)` pairs the run should scan, skipping empty media. - Pulled out of the cross-item iterator so the outer scan stays - flat. Per-entry tallies (skips) go on the report; the caller - decides whether the item itself counts as processed by checking - if this iterator yielded anything. + Pulled out so `_iter_eligible_candidates` stays under radon grade C. + An `items_filter` of `None` is "every item"; an empty media list + is silently skipped (no photos to consider). """ - for index, entry in enumerate(item.media): - if not _eligible(entry, force=force, current_version=current_version): - _tally_skipped(entry, current_version=current_version, report=report) + for item_id, item in items.items(): + if items_filter is not None and item_id not in items_filter: continue - assert isinstance(entry, (MediaPhotoDownloaded, MediaPhotoDescribed)) - yield _Candidate(item_id=item_id, item=item, index=index, entry=entry) - - -def _take_with_limit(candidates: Iterator[_Candidate], limit: int | None) -> Iterator[_Candidate]: - """Yield from `candidates`, stopping after `limit` items. - - Pulled out so `_iter_candidates` does not interleave the - limit-countdown with the per-item bookkeeping. `None` means "no - limit" — yields everything. - """ - if limit is None: - yield from candidates - return - remaining = limit - for candidate in candidates: - if remaining <= 0: - return - remaining -= 1 - yield candidate + if not item.media: + continue + yield item_id, item -def _iter_candidates( +def _iter_eligible_candidates( items: dict[str, Item], *, force: bool, limit: int | None, items_filter: set[str] | None, current_version: str, + current_language: str, report: DescribeReport, ) -> Iterator[_Candidate]: - """Yield each candidate eligible for description on THIS run. - - Side effects on `report`: bumps `items_processed` once per item - that contributes at least one yielded candidate, and bumps - `photos_skipped_already_described` for each `MediaPhotoDescribed` - entry passed over (via `_tally_skipped`). Stops yielding once - `limit` is exhausted. - - `items_processed` is bumped on the FIRST yielded candidate of an - item, not at the end of its scan — that way a `limit` that - truncates mid-item still counts the item as processed (work - happened on it). Items whose every photo was skipped do NOT count - as processed. - """ - seen_item_ids: set[str] = set() + """Yield each candidate eligible for description, with all bookkeeping inline. - def _all_eligible() -> Iterator[_Candidate]: - for item_id, item in items.items(): - if items_filter is not None and item_id not in items_filter: - continue - if not item.media: - continue - yield from _iter_item_candidates( - item_id, - item, - force=force, - current_version=current_version, - report=report, - ) - - for candidate in _take_with_limit(_all_eligible(), limit): - if candidate.item_id not in seen_item_ids: - seen_item_ids.add(candidate.item_id) - report.items_processed += 1 - yield candidate + Side effects on `report` (mirrors `media._iter_eligible_attempts`): + - bumps `items_processed` once per item that contributes at least + one yielded candidate (first yielded candidate of an item — a + `limit` that truncates mid-item still counts the item, items + whose every photo was skipped do NOT count); + - bumps `photos_skipped_already_described` for each + `MediaPhotoDescribed` entry on the current version+language that + gets passed over (via `_tally_idempotency_skip`). -def _chunked(candidates: list[_Candidate], size: int) -> Iterator[list[_Candidate]]: - """Split a candidate list into batches of at most `size` entries. - - `size` is guaranteed positive by the CLI layer (`Typer` validates - integer ranges); this helper does not re-validate. An empty input - yields nothing — the orchestrator never issues a no-op API call. + Stops yielding once `limit` is exhausted. Replaces the four-helper + chain that was here before (`_tally_skipped` + `_iter_item_candidates` + + `_take_with_limit` + outer `_iter_candidates`). """ - for start in range(0, len(candidates), size): - yield candidates[start : start + size] + remaining = limit + seen_item_ids: set[str] = set() + for item_id, item in _filtered_items(items, items_filter): + for index, entry in enumerate(item.media): + if remaining is not None and remaining <= 0: + return + if not _is_eligible( + entry, + force=force, + current_version=current_version, + current_language=current_language, + ): + _tally_idempotency_skip( + entry, + current_version=current_version, + current_language=current_language, + report=report, + ) + continue + assert isinstance(entry, (MediaPhotoDownloaded, MediaPhotoDescribed)) + if item_id not in seen_item_ids: + seen_item_ids.add(item_id) + report.items_processed += 1 + if remaining is not None: + remaining -= 1 + yield _Candidate(item_id=item_id, item=item, index=index, entry=entry) def _media_type(local_path: str) -> str: """Map an on-disk path's extension to its Anthropic media-type string. - The downloader writes one of `.jpg` / `.png` / `.webp` (see - `xbrain.media._FORMAT_EXTENSIONS`). Anything else came from a hand - edit of `items.json` or a future format we have not registered — - fall back to `image/jpeg` and let Anthropic reject it if the bytes - do not match. The fall-back keeps a per-photo failure out of the - total-failure raise path. + The downloader writes one of `.jpg` / `.jpeg` / `.png` / `.webp` + (see `xbrain.media._FORMAT_EXTENSIONS`). Anything else came from a + hand edit of `items.json` or a future format we have not registered + — we emit a `logger.warning` (so the operator can see the wrong + MIME type was sent) and fall back to `image/jpeg`. Anthropic will + reject the request if the bytes do not match; that surfaces as a + per-batch failure rather than a silent total-failure raise. """ suffix = Path(local_path).suffix.lower() - return _MEDIA_TYPES.get(suffix, "image/jpeg") + media_type = _MEDIA_TYPES.get(suffix) + if media_type is None: + logger.warning( + "describe: unknown extension %r for local_path %s; " + "sending as image/jpeg (Anthropic may reject)", + suffix, + local_path, + ) + return "image/jpeg" + return media_type def _load_bytes(media_root: Path, local_path: str) -> bytes: @@ -315,7 +344,7 @@ def _load_bytes(media_root: Path, local_path: str) -> bytes: Raises `OSError` (a `FileNotFoundError` subclass when the file is missing). The orchestrator's `_recoverable_errors` tuple catches it - so a missing file is a per-photo failure (the operator can re-run + so a missing file is a per-batch failure (the operator can re-run `xbrain media` to repopulate), never a whole-batch abort. """ return (media_root / local_path).read_bytes() @@ -343,20 +372,6 @@ def _system_prompt(language: str) -> str: return load_rubric("describe-image", language=language) -def _user_directive(batch_size: int) -> str: - """Plain-text directive that follows the image blocks in the user turn. - - Tells the model the index range so the JSON list it emits is - contractually one-to-one with the input order. Indices are - zero-based to match Python and the rubric's example. - """ - last = batch_size - 1 - return ( - f"Describe images 0 through {last}. " - "Return a JSON list with one entry per image, in the order received." - ) - - def _extract_response_text(response: object) -> str: """Pull the JSON-bearing text out of a vision response, stripping fences. @@ -364,8 +379,8 @@ def _extract_response_text(response: object) -> str: of typed blocks; only `text` blocks carry JSON. Some models wrap the JSON in a ```json ... ``` Markdown fence despite the rubric explicitly forbidding it — strip a single leading/trailing fence - pair so the downstream `json.loads` does not trip on a Markdown - artefact. + pair (with or without a language tag) so the downstream + `json.loads` does not trip on a Markdown artefact. """ blocks = [b for b in getattr(response, "content", []) if getattr(b, "type", None) == "text"] if not blocks: @@ -420,7 +435,9 @@ def _parse_batch_response(response: object, batch_size: int) -> list[dict]: with shape (list of judgments with unique indices), and `_validate_judgment_entry` deals with per-entry typing. Each step raises `ValueError` so the orchestrator's `_recoverable_errors` - tuple catches every shape violation as a per-batch failure. + tuple catches every shape violation as a per-batch failure. The + caller logs `response.stop_reason` alongside any `JSONDecodeError` + so a `max_tokens` truncation surfaces as a diagnosable cause. """ text = _extract_response_text(response) decoded = json.loads(text) @@ -442,7 +459,7 @@ def _apply_judgment( candidate: _Candidate, judgment: dict, *, - language: str, + language: SupportedLanguage, version: str, described_at: datetime, ) -> MediaPhotoDescribed: @@ -496,71 +513,110 @@ def _record_batch_failure( ) +def _apply_batch_as_decorative_empty( + *, + batch: list[_Candidate], + language: SupportedLanguage, + version: str, + described_at: datetime, + report: DescribeReport, +) -> None: + """Mark every photo in a refused batch as decorative + empty description. + + The Anthropic SDK returns `stop_reason="refusal"` on safety-policy + refusals (identifiable faces, NSFW, ...). The rubric already says + "if undescribable, mark decorative with empty description" — we + apply that at the SDK level too, so the batch makes progress + instead of churning forever. Each photo transitions to + `MediaPhotoDescribed(is_decorative=True, description="")` and + counts toward `photos_described`, not `photos_failed`. + """ + for candidate in batch: + new_entry = MediaPhotoDescribed( + url=candidate.entry.url, + local_path=candidate.entry.local_path, + width=candidate.entry.width, + height=candidate.entry.height, + bytes_size=candidate.entry.bytes_size, + downloaded_at=candidate.entry.downloaded_at, + is_decorative=True, + description="", + description_lang=language, + description_version=version, + described_at=described_at, + ) + candidate.item.media[candidate.index] = new_entry + report.photos_described += 1 + + def describe_all( items: dict[str, Item], media_root: Path, *, model: str, - output_language: str, + output_language: SupportedLanguage, description_version: str, force: bool = False, limit: int | None = None, items_filter: list[str] | None = None, batch_size: int = _DEFAULT_BATCH_SIZE, - client: object | None = None, + client: VisionClient | None = None, on_progress: Callable[[], None] | None = None, now: Callable[[], datetime] | None = None, ) -> DescribeReport: """Describe every eligible photo across the store; return a structured report. Eligibility (without `--force`): + - `MediaPhotoDownloaded` — always. - - `MediaPhotoDescribed` whose `description_version` ≠ `description_version` - (the per-call argument, sourced from `config.describe_version`). + - `MediaPhotoDescribed` whose `description_version` ≠ the configured + version, OR whose `description_lang` ≠ the configured language. + The language check guards against mixed-language vaults that + would otherwise leak Spanish prose into an English run (or + vice-versa) via the enrich prompt. With `--force`: + - Every `MediaPhotoDownloaded` AND every `MediaPhotoDescribed` is re-described. The persisted description is overwritten. Out of scope (every run): - - `MediaPhotoPending` — describing only runs over photos with bytes - on disk; the operator must call `xbrain media` first. - - `MediaPhotoFailed` — same reason. - - `MediaVideoPending` — photos only; vision-API support for video - is a separate phase. + + - `MediaPhotoPending`, `MediaPhotoFailed`, `MediaVideoPending` — + describing only runs over photos with bytes on disk. The function mutates `items` in place; the caller is expected to wrap each batch transition with a store write (the `on_progress` - callback fires after every batch, success or failure). The - Ctrl-C-coherent invariant lives there: the store is written - between batches, never mid-API-call. - - `media_root` is the directory under which `/.` - photo files live (typically `data/media/`). + callback fires after every batch). The Ctrl-C-coherent invariant + lives there: the store is written between batches. Raises: - RuntimeError: when EVERY batch attempted in the run fails. A - total failure (a revoked API key, an exhausted quota, a - corrupted media tree) must surface as a non-zero exit. The + RuntimeError: when EVERY batch attempted in the run fails. The CLI's `_handle_cli_errors` converts this into a clean operator message + exit code 1. """ if client is None: from anthropic import Anthropic # lazy: tests inject FakeAnthropic - client = Anthropic() # reads ANTHROPIC_API_KEY from the environment + # reads ANTHROPIC_API_KEY from the environment; the real `Anthropic` + # class is structurally compatible with `VisionClient` so the cast + # is a documentation tool, not a runtime conversion. + active_client: VisionClient = Anthropic() # type: ignore[assignment] + else: + active_client = client clock: Callable[[], datetime] = now if now is not None else _utcnow started = time.monotonic() report = DescribeReport() filter_set = set(items_filter) if items_filter else None candidates = list( - _iter_candidates( + _iter_eligible_candidates( items, force=force, limit=limit, items_filter=filter_set, current_version=description_version, + current_language=output_language, report=report, ) ) @@ -571,10 +627,14 @@ def describe_all( system = _system_prompt(output_language) recoverable = _recoverable_errors() - for batch in _chunked(candidates, batch_size): + # `itertools.batched` requires Python 3.12 (declared in `pyproject.toml`). + # It returns tuples; the orchestrator works in lists so it can use `len` + # and slice freely. + for batch_tuple in itertools.batched(candidates, batch_size): + batch = list(batch_tuple) _run_one_batch( batch=batch, - client=client, + client=active_client, model=model, system=system, output_language=output_language, @@ -592,13 +652,29 @@ def describe_all( return report +def _user_directive_text(batch_size: int) -> dict: + """Build the trailing text block for the user turn — index-range directive. + + Tells the model the index range so the JSON list it emits is + contractually one-to-one with the input order. Indices are + zero-based to match Python and the rubric's example. + """ + return { + "type": "text", + "text": ( + f"Describe images 0 through {batch_size - 1}. " + "Return a JSON list with one entry per image, in the order received." + ), + } + + def _run_one_batch( *, batch: list[_Candidate], - client: object, + client: VisionClient, model: str, system: str, - output_language: str, + output_language: SupportedLanguage, description_version: str, clock: Callable[[], datetime], media_root: Path, @@ -609,6 +685,11 @@ def _run_one_batch( Pulled out of `describe_all` to keep the outer loop's complexity under grade C while preserving the per-batch isolation contract. + Refusal responses (`stop_reason="refusal"`) are converted to a + decorative-empty transition for each photo in the batch — that + satisfies the spec's "if undescribable, mark decorative" rule at + the SDK level so the run makes progress instead of churning. + Programmer bugs (`AttributeError`, ...) and `KeyboardInterrupt` fall outside `recoverable` and propagate. """ @@ -616,14 +697,39 @@ def _run_one_batch( report.photos_attempted += len(batch) try: content_blocks = _build_user_blocks(batch, media_root) - response = _messages_create( - client=client, + response = client.messages.create( model=model, max_tokens=_MAX_TOKENS, system=system, - content_blocks=content_blocks, + messages=[{"role": "user", "content": content_blocks}], ) - judgments = _parse_batch_response(response, batch_size=len(batch)) + stop_reason = getattr(response, "stop_reason", None) + if stop_reason == "refusal": + logger.warning( + "describe: batch of %d photos refused by SDK; " + "marking each as decorative with empty description", + len(batch), + ) + _apply_batch_as_decorative_empty( + batch=batch, + language=output_language, + version=description_version, + described_at=clock(), + report=report, + ) + return + try: + judgments = _parse_batch_response(response, batch_size=len(batch)) + except json.JSONDecodeError as exc: + # Surface stop_reason so a `max_tokens` truncation is + # diagnosable from the warning line alone — otherwise the + # operator gets a generic "Expecting value: line N column M". + logger.warning( + "describe: malformed JSON from vision response (stop_reason=%r): %s", + stop_reason, + exc, + ) + raise _apply_batch_judgments( batch=batch, judgments=judgments, @@ -667,60 +773,38 @@ def _build_user_blocks(batch: list[_Candidate], media_root: Path) -> list[dict]: for candidate in batch: data = _load_bytes(media_root, candidate.entry.local_path) blocks.append(_build_image_block(data, _media_type(candidate.entry.local_path))) - blocks.append({"type": "text", "text": _user_directive(len(batch))}) + blocks.append(_user_directive_text(len(batch))) return blocks -def _messages_create( - *, - client: object, - model: str, - max_tokens: int, - system: str, - content_blocks: list[dict], -) -> object: - """Thin wrapper around the Anthropic SDK's `messages.create`. - - Pulled out so the orchestrator stays readable and so tests can - inject a `FakeAnthropic` whose `messages.create` records kwargs. - Returns the raw response — the parser inspects `.content` blocks. - """ - return client.messages.create( # type: ignore[attr-defined] - model=model, - max_tokens=max_tokens, - system=system, - messages=[{"role": "user", "content": content_blocks}], - ) - - def _apply_batch_judgments( *, batch: list[_Candidate], judgments: list[dict], - language: str, + language: SupportedLanguage, version: str, described_at: datetime, report: DescribeReport, ) -> None: """Apply per-image judgments to the batch, transitioning each entry. - Pairs each judgment with the candidate at its `index`. A judgment - count that does not match the batch size is a contract violation - (parser already enforces the index range) — if the list is short, - the missing candidates are tallied as per-photo failures so they - re-enter the candidate pool on the next run; if it is long, the - excess is a programmer bug (parser-side dup check rules out - duplicate indices already). + Pairs each judgment with the candidate at its `index`. The parser + already enforces the index range and rejects duplicates, so the + `by_index` dict is built directly without a paranoid re-check. + + A judgment list SHORTER than the batch is a partial-data contract + violation: the missing candidates are tallied as per-photo failures + (so they re-enter the candidate pool on the next run) AND the + batch counts as `batches_failed += 1` even though some judgments + landed — the batch did not return complete data, so the operator's + view of "did this API call do its job" is "no, partially". """ by_index = {entry["index"]: entry for entry in judgments} - if len(by_index) != len(judgments): - # Defence in depth — `_parse_batch_response` already rejects - # duplicate indices, so reaching here means the parser was - # bypassed. Raise so the developer sees it. - raise ValueError("internal: duplicate judgment indices survived parser check") + batch_had_omission = False for position, candidate in enumerate(batch): judgment = by_index.get(position) if judgment is None: + batch_had_omission = True report.photos_failed += 1 report.per_item_failures.setdefault(candidate.item_id, []).append( (candidate.entry.url, f"vision response omitted index {position}") @@ -735,6 +819,11 @@ def _apply_batch_judgments( ) candidate.item.media[candidate.index] = new_entry report.photos_described += 1 + if batch_had_omission: + # A batch with missing judgments did not return complete data + # for the API call we made — count it as a batch failure so + # the "did this run net positive?" check is honest. + report.batches_failed += 1 def emit_summary_line(report: DescribeReport, *, out: "io.IOBase | None" = None) -> None: diff --git a/tests/test_cli.py b/tests/test_cli.py index d36d04b..1e6ec6f 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -979,3 +979,204 @@ def _patched(store, media_root, **kwargs): result = runner.invoke(app, ["describe"]) assert result.exit_code == 1 assert "Error" in result.output + + +def test_describe_command_emits_exactly_one_summary_on_partial_failure(tmp_path, monkeypatch): + """The CLI is the single source of truth for the SUMMARY line. + + A partial-failure run must emit EXACTLY one `SUMMARY:` on stderr — + if the orchestrator regrows a second emitter the count goes to 2 + and this pins it. Mirrors the dedup test on the orchestrator + side (`test_describe_all_does_not_emit_summary_on_partial_failure`). + """ + import io as _io + import json as _json + from datetime import datetime, timezone + + from anthropic import APIError + from PIL import Image + + from xbrain.models import Author, Item, MediaPhotoDownloaded + from xbrain.store import save_store as _save_store + + _setup_repo(tmp_path, monkeypatch) + media_root = tmp_path / "data" / "media" + (media_root / "1").mkdir(parents=True, exist_ok=True) + (media_root / "2").mkdir(parents=True, exist_ok=True) + buf = _io.BytesIO() + Image.new("RGB", (8, 6), color=(10, 20, 30)).save(buf, format="JPEG") + (media_root / "1" / "0.jpg").write_bytes(buf.getvalue()) + (media_root / "2" / "0.jpg").write_bytes(buf.getvalue()) + + def _build_item(item_id: str) -> Item: + return Item( + id=item_id, + source="bookmark", + url=f"https://x.com/a/status/{item_id}", + author=Author(handle="a", name="A"), + text="text", + created_at=datetime(2026, 5, 10, tzinfo=timezone.utc), + captured_at=datetime(2026, 5, 24, tzinfo=timezone.utc), + media=[ + MediaPhotoDownloaded( + url=f"https://pbs.twimg.com/media/{item_id}-0.jpg", + local_path=f"{item_id}/0.jpg", + width=8, + height=6, + bytes_size=200, + downloaded_at=datetime(2026, 5, 24, tzinfo=timezone.utc), + ) + ], + ) + + _save_store( + {"1": _build_item("1"), "2": _build_item("2")}, + tmp_path / "data" / "items.json", + ) + + class _PartialFailMessages: + def __init__(self): + self.calls: list[dict] = [] + self._counter = 0 + + def create(self, **kwargs): + self.calls.append(kwargs) + self._counter += 1 + if self._counter == 1: + # First batch fails. + raise APIError("503", request=None, body=None) + + # Second batch returns one judgment. + class _Block: + type = "text" + + def __init__(self, text: str): + self.text = text + + class _Resp: + def __init__(self, text: str): + self.content = [_Block(text)] + self.stop_reason = "end_turn" + + return _Resp(_json.dumps([{"index": 0, "is_decorative": False, "description": "ok"}])) + + class _FakeClient: + def __init__(self): + self.messages = _PartialFailMessages() + + import xbrain.cli as cli + + fake = _FakeClient() + + def _patched(store, media_root, **kwargs): + kwargs["client"] = fake + kwargs["batch_size"] = 1 + return _orig(store, media_root, **kwargs) + + _orig = cli.run_describe_all + monkeypatch.setattr(cli, "run_describe_all", _patched) + + result = runner.invoke(app, ["describe"]) + assert result.exit_code == 0 + # Exactly one SUMMARY emission — the CLI's `emit_summary_line` is + # the single source of truth; the orchestrator stays silent. + assert result.output.count("SUMMARY:") == 1 + + +def test_describe_command_verbose_lists_failed_photos(tmp_path, monkeypatch): + """`--verbose` prints `Failed photos:` plus per-failure rows on partial failure. + + Pins the diagnostic branch in `_run_describe` so a future refactor + that drops the verbose output is caught. + """ + import io as _io + import json as _json + from datetime import datetime, timezone + + from anthropic import APIError + from PIL import Image + + from xbrain.models import Author, Item, MediaPhotoDownloaded + from xbrain.store import save_store as _save_store + + _setup_repo(tmp_path, monkeypatch) + media_root = tmp_path / "data" / "media" + (media_root / "1").mkdir(parents=True, exist_ok=True) + (media_root / "2").mkdir(parents=True, exist_ok=True) + buf = _io.BytesIO() + Image.new("RGB", (8, 6), color=(10, 20, 30)).save(buf, format="JPEG") + (media_root / "1" / "0.jpg").write_bytes(buf.getvalue()) + (media_root / "2" / "0.jpg").write_bytes(buf.getvalue()) + + def _build_item(item_id: str) -> Item: + return Item( + id=item_id, + source="bookmark", + url=f"https://x.com/a/status/{item_id}", + author=Author(handle="a", name="A"), + text="text", + created_at=datetime(2026, 5, 10, tzinfo=timezone.utc), + captured_at=datetime(2026, 5, 24, tzinfo=timezone.utc), + media=[ + MediaPhotoDownloaded( + url=f"https://pbs.twimg.com/media/{item_id}-0.jpg", + local_path=f"{item_id}/0.jpg", + width=8, + height=6, + bytes_size=200, + downloaded_at=datetime(2026, 5, 24, tzinfo=timezone.utc), + ) + ], + ) + + _save_store( + {"1": _build_item("1"), "2": _build_item("2")}, + tmp_path / "data" / "items.json", + ) + + class _PartialFailMessages: + def __init__(self): + self.calls: list[dict] = [] + self._counter = 0 + + def create(self, **kwargs): + self.calls.append(kwargs) + self._counter += 1 + if self._counter == 1: + raise APIError("503", request=None, body=None) + + class _Block: + type = "text" + + def __init__(self, text: str): + self.text = text + + class _Resp: + def __init__(self, text: str): + self.content = [_Block(text)] + self.stop_reason = "end_turn" + + return _Resp(_json.dumps([{"index": 0, "is_decorative": False, "description": "ok"}])) + + class _FakeClient: + def __init__(self): + self.messages = _PartialFailMessages() + + import xbrain.cli as cli + + fake = _FakeClient() + + def _patched(store, media_root, **kwargs): + kwargs["client"] = fake + kwargs["batch_size"] = 1 + return _orig(store, media_root, **kwargs) + + _orig = cli.run_describe_all + monkeypatch.setattr(cli, "run_describe_all", _patched) + + result = runner.invoke(app, ["describe", "--verbose"]) + assert result.exit_code == 0 + assert "Failed photos:" in result.output + # The failed item id (1 or 2 — order is filesystem-dependent) and the URL + # both appear on the verbose row. + assert "pbs.twimg.com" in result.output diff --git a/tests/test_describe.py b/tests/test_describe.py index 5072b95..d6aeaae 100644 --- a/tests/test_describe.py +++ b/tests/test_describe.py @@ -27,7 +27,7 @@ from xbrain.describe import ( DescribeReport, - _eligible, + _is_eligible, _parse_batch_response, _validate_judgment_entry, describe_all, @@ -163,14 +163,50 @@ class _FakeListResponse: `json.dumps(payload)` — that path expects a dict. For describe we need to ship a list payload, so we build a parallel response type that mirrors the shape `_parse_batch_response` consumes. + + `stop_reason` defaults to `"end_turn"` (the SDK's normal terminus); + pass `stop_reason="refusal"` to simulate a safety-policy refusal, + or any other string to simulate a max-tokens truncation etc. """ - def __init__(self, judgments: list[dict]): + def __init__(self, judgments: list[dict], *, stop_reason: str = "end_turn"): import json self.content = [type(FakeBlock(payload={}))(payload={})] # placeholder block # Overwrite the text on the block with the JSON list serialisation. self.content[0].text = json.dumps(judgments) + self.stop_reason = stop_reason + + +class _FakeRefusalResponse: + """A response object simulating Anthropic SDK refusal. + + `stop_reason="refusal"` is the SDK's signal that the model declined + to answer (identifiable face, NSFW, ...). The orchestrator must + detect this and convert the batch to decorative+empty rather than + fail it. `content` is intentionally empty — a real refusal response + can carry no text blocks. + """ + + def __init__(self): + self.content: list = [] + self.stop_reason = "refusal" + + +class _FakeTruncatedResponse: + """A response with `stop_reason="max_tokens"` and malformed JSON text. + + Simulates the failure mode where the model hit the token cap mid-emit + and the SDK returns a truncated, unparseable JSON fragment. The + orchestrator must surface `stop_reason` in the warning log line so + the operator can diagnose the cause from logs alone. + """ + + def __init__(self, truncated_text: str = '[{"index": 0, "is_dec'): + block = type(FakeBlock(payload={}))(payload={}) + block.text = truncated_text + self.content = [block] + self.stop_reason = "max_tokens" class _FakeMessagesList: @@ -178,18 +214,25 @@ class _FakeMessagesList: Mirrors `tests.conftest.FakeMessages` but uses `_FakeListResponse` so the payload is a JSON list. Exception instances are raised - rather than wrapped (same convention). + rather than wrapped (same convention). A payload that is already a + response object (e.g. `_FakeRefusalResponse`) is returned as-is so + tests can simulate stop-reason variants without a wrapper. """ def __init__(self, payloads: list): self._payloads = list(payloads) self.calls: list[dict] = [] - def create(self, **kwargs) -> _FakeListResponse: + def create(self, **kwargs): self.calls.append(kwargs) payload = self._payloads.pop(0) if isinstance(payload, Exception): raise payload + # Pass pre-built response objects through unchanged so callers + # can ship refusal / truncation / custom-stop-reason responses + # without a `_FakeListResponse` wrapper around a JSON list. + if hasattr(payload, "content") and hasattr(payload, "stop_reason"): + return payload return _FakeListResponse(payload) @@ -204,23 +247,43 @@ def __init__(self, payloads: list): def test_eligible_downloaded_always_eligible(): - """Downloaded entries are always eligible regardless of force/version.""" + """Downloaded entries are always eligible regardless of force/version/language.""" entry = _downloaded() - assert _eligible(entry, force=False, current_version="v1") is True - assert _eligible(entry, force=True, current_version="v1") is True + assert ( + _is_eligible(entry, force=False, current_version="v1", current_language="English") is True + ) + assert _is_eligible(entry, force=True, current_version="v1", current_language="English") is True def test_eligible_described_current_version_only_with_force(): - """A described entry on the current version is skipped without --force.""" + """A described entry on the current version+language is skipped without --force.""" entry = _described(version="v1") - assert _eligible(entry, force=False, current_version="v1") is False - assert _eligible(entry, force=True, current_version="v1") is True + assert ( + _is_eligible(entry, force=False, current_version="v1", current_language="English") is False + ) + assert _is_eligible(entry, force=True, current_version="v1", current_language="English") is True def test_eligible_described_stale_version_is_eligible(): """A described entry on a stale version is eligible without --force.""" entry = _described(version="v1") - assert _eligible(entry, force=False, current_version="v2") is True + assert ( + _is_eligible(entry, force=False, current_version="v2", current_language="English") is True + ) + + +def test_eligible_described_stale_language_is_eligible(): + """A described entry whose language drifted vs the current config is eligible. + + Guards against the mixed-language vault regression: switching + `output_language` from English → Spanish (or vice-versa) must mark + previously-described entries stale on the next run so the enrich + prompt does not splice the wrong language into the new vault. + """ + entry = _described(version="v1") # described_lang="English" by default + assert ( + _is_eligible(entry, force=False, current_version="v1", current_language="Spanish") is True + ) def test_eligible_pending_failed_video_are_never_eligible(): @@ -235,8 +298,14 @@ def test_eligible_pending_failed_video_are_never_eligible(): ) video = MediaVideoPending(url="https://video.twimg.com/x.mp4") for entry in (pending, failed, video): - assert _eligible(entry, force=False, current_version="v1") is False - assert _eligible(entry, force=True, current_version="v1") is False + assert ( + _is_eligible(entry, force=False, current_version="v1", current_language="English") + is False + ) + assert ( + _is_eligible(entry, force=True, current_version="v1", current_language="English") + is False + ) # --------------------------------------------------------------------- parser @@ -909,6 +978,168 @@ def test_describe_all_uses_injectable_clock(tmp_path: Path): assert described.described_at == fixed +# --------------------------------------------------------------------- payload shape + + +def test_describe_all_sends_payload_with_expected_kwarg_shape(tmp_path: Path): + """The kwargs sent to `messages.create` must match the wire contract. + + A regression in the payload shape (wrong `max_tokens`, dropped + `system`, role flip, image-block restructure) is otherwise invisible + until production. Pins the structure end-to-end. + """ + media_root = tmp_path / "media" + item = _item( + "1", + [_downloaded(item_id="1", index=i, media_root=media_root) for i in range(2)], + ) + store = {"1": item} + client = _FakeVisionClient([[_judgment(0), _judgment(1)]]) + describe_all( + store, + media_root, + model="claude-sonnet-4-6", + output_language="English", + description_version="v1", + batch_size=2, + client=client, + ) + kwargs = client.messages.calls[0] + assert kwargs["model"] == "claude-sonnet-4-6" + assert kwargs["max_tokens"] == 1200 + assert isinstance(kwargs["system"], str) and kwargs["system"] + messages = kwargs["messages"] + assert isinstance(messages, list) and len(messages) == 1 + assert messages[0]["role"] == "user" + content = messages[0]["content"] + # 2 image blocks + exactly one trailing text directive. + assert len(content) == 3 + image_blocks = content[:2] + for block in image_blocks: + assert block["type"] == "image" + source = block["source"] + assert source["type"] == "base64" + assert source["media_type"] == "image/jpeg" + assert isinstance(source["data"], str) and source["data"] + text_block = content[2] + assert text_block["type"] == "text" + assert "Describe images 0 through 1" in text_block["text"] + + +# --------------------------------------------------------------------- refusal + + +def test_describe_all_handles_refusal_as_decorative_empty(tmp_path: Path): + """SDK `stop_reason="refusal"` must transition every photo to decorative+empty. + + The Anthropic safety policy refuses identifiable faces / NSFW; the + rubric says "if undescribable, mark decorative with empty + description". The orchestrator applies that contract at the SDK + level too so the batch makes progress instead of churning forever. + """ + media_root = tmp_path / "media" + item = _item( + "1", + [_downloaded(item_id="1", index=i, media_root=media_root) for i in range(5)], + ) + store = {"1": item} + client = _FakeVisionClient([_FakeRefusalResponse()]) + report = describe_all( + store, + media_root, + model="m", + output_language="English", + description_version="v1", + batch_size=5, + client=client, + ) + # All five photos must be Described as decorative + empty. + for media_entry in item.media: + assert isinstance(media_entry, MediaPhotoDescribed) + assert media_entry.is_decorative is True + assert media_entry.description == "" + # Refusal handling counts as described, not failed — the photo made + # progress out of Downloaded. + assert report.photos_described == 5 + assert report.photos_failed == 0 + + +def test_describe_all_logs_stop_reason_on_json_decode_failure(tmp_path: Path, caplog): + """A `max_tokens` truncation must surface stop_reason in the warning log. + + Otherwise the operator gets only "Expecting value: line N column M" + and cannot tell whether the cause is a truncated response, a + refusal, or a malformed model emission. + """ + media_root = tmp_path / "media" + item = _item("1", [_downloaded(item_id="1", index=0, media_root=media_root)]) + store = {"1": item} + client = _FakeVisionClient([_FakeTruncatedResponse()]) + with caplog.at_level("WARNING", logger="xbrain.describe"): + # Total failure raises (1 batch attempted, 0 described). + with pytest.raises(RuntimeError): + describe_all( + store, + media_root, + model="m", + output_language="English", + description_version="v1", + batch_size=1, + client=client, + ) + messages = "\n".join(record.getMessage() for record in caplog.records) + assert "stop_reason='max_tokens'" in messages or "stop_reason=" in messages + + +# --------------------------------------------------------------------- fence variants + + +def test_parse_batch_response_strips_fence_without_language_tag(): + """Some models emit ``` ... ``` (no `json` after the opening fence).""" + import json + + class _Fenced: + content = [type("B", (), {"type": "text", "text": ""})()] + + fenced_text = "```\n" + json.dumps([_judgment(0)]) + "\n```" + _Fenced.content[0].text = fenced_text + out = _parse_batch_response(_Fenced(), batch_size=1) + assert out[0]["index"] == 0 + + +# --------------------------------------------------------------------- partial-data batch + + +def test_describe_all_partial_judgments_count_as_batch_failed(tmp_path: Path): + """A batch with missing judgments must bump batches_failed (not 0). + + The vision model returned 1 of 2 expected judgments. One photo + transitions to Described, the other stays Downloaded, and the + batch counts as a failure because it did NOT return complete + data for the API call we made. + """ + media_root = tmp_path / "media" + item = _item( + "1", + [_downloaded(item_id="1", index=i, media_root=media_root) for i in range(2)], + ) + store = {"1": item} + # Only one judgment for a 2-image batch. + client = _FakeVisionClient([[_judgment(0, description="ok")]]) + report = describe_all( + store, + media_root, + model="m", + output_language="English", + description_version="v1", + batch_size=2, + client=client, + ) + assert report.photos_described == 1 + assert report.photos_failed == 1 + assert report.batches_failed == 1 + + # --------------------------------------------------------------------- summary line From a373eff8814496655686228723fc9b5b0a5b59b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Victor=20Gonz=C3=A1lez-Pacheco?= Date: Sun, 24 May 2026 20:40:12 +0200 Subject: [PATCH 10/11] [#34] inline _user_directive_text into _build_user_blocks Single-caller helper merged into its only caller. Removes the last surviving gold-plating flagged by code-simplifier in round 1 of the Phase B review pipeline. --- src/xbrain/describe.py | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/src/xbrain/describe.py b/src/xbrain/describe.py index fd7d675..6bf6e36 100644 --- a/src/xbrain/describe.py +++ b/src/xbrain/describe.py @@ -652,22 +652,6 @@ def describe_all( return report -def _user_directive_text(batch_size: int) -> dict: - """Build the trailing text block for the user turn — index-range directive. - - Tells the model the index range so the JSON list it emits is - contractually one-to-one with the input order. Indices are - zero-based to match Python and the rubric's example. - """ - return { - "type": "text", - "text": ( - f"Describe images 0 through {batch_size - 1}. " - "Return a JSON list with one entry per image, in the order received." - ), - } - - def _run_one_batch( *, batch: list[_Candidate], @@ -773,7 +757,18 @@ def _build_user_blocks(batch: list[_Candidate], media_root: Path) -> list[dict]: for candidate in batch: data = _load_bytes(media_root, candidate.entry.local_path) blocks.append(_build_image_block(data, _media_type(candidate.entry.local_path))) - blocks.append(_user_directive_text(len(batch))) + # Trailing text directive: tell the model the index range so its JSON + # list is contractually one-to-one with the input order. Indices are + # zero-based to match Python and the rubric's example. + blocks.append( + { + "type": "text", + "text": ( + f"Describe images 0 through {len(batch) - 1}. " + "Return a JSON list with one entry per image, in the order received." + ), + } + ) return blocks From 4c09ffffc3dd8586acb7613fd1392de0dd3fd4b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Victor=20Gonz=C3=A1lez-Pacheco?= Date: Sun, 24 May 2026 20:41:34 +0200 Subject: [PATCH 11/11] [#34] add negative tests for MediaPhotoDescribed invariants + media_type warning Closes 3 flags from the verify round of the Phase B review pipeline: - test_models.py: 3 negative tests pin the model-level invariants on MediaPhotoDescribed (UTC-aware described_at, SUPPORTED_LANGUAGES membership for description_lang, decorative => empty description). - test_describe.py: 1 unit test pins _media_type's unknown-extension fallback to image/jpeg AND the operator-visible logger.warning, so a silent regression on either side is caught. Test count: 502 -> 506. Quality gate green. --- tests/test_describe.py | 19 +++++++++ tests/test_models.py | 87 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 106 insertions(+) diff --git a/tests/test_describe.py b/tests/test_describe.py index d6aeaae..f5b44eb 100644 --- a/tests/test_describe.py +++ b/tests/test_describe.py @@ -1026,6 +1026,25 @@ def test_describe_all_sends_payload_with_expected_kwarg_shape(tmp_path: Path): assert "Describe images 0 through 1" in text_block["text"] +def test_media_type_warns_on_unknown_extension(caplog): + """Unknown extensions fall back to image/jpeg AND emit a `logger.warning`. + + The downloader only writes `.jpg`/`.jpeg`/`.png`/`.webp`, so an unknown + suffix means either a hand-edited `items.json` or a future format the + code does not register. The warning is the operator's only signal that + the wrong MIME type was sent to Anthropic — pin it so a silent fallback + cannot regress. + """ + import logging + + from xbrain.describe import _media_type + + with caplog.at_level(logging.WARNING, logger="xbrain.describe"): + result = _media_type("123/0.gif") + assert result == "image/jpeg" + assert any("unknown extension" in rec.message.lower() for rec in caplog.records) + + # --------------------------------------------------------------------- refusal diff --git a/tests/test_models.py b/tests/test_models.py index 8df67c9..7a80397 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -408,6 +408,93 @@ def test_media_photo_described_rejects_parent_traversal_local_path(): ) +def test_media_photo_described_rejects_naive_described_at(): + """`described_at` must be timezone-aware (UTC); a naive datetime is rejected. + + The UTC-aware invariant is enforced by a field validator so a + hand-edited `items.json` entry cannot smuggle a local-time datetime + past the type boundary. Naive timestamps cause downstream UTC math + (eligibility checks, sort orders) to drift silently. + """ + import pytest + from pydantic import ValidationError + + from xbrain.models import MediaPhotoDescribed + + with pytest.raises(ValidationError): + MediaPhotoDescribed( + url="https://pbs.twimg.com/media/X.jpg", + local_path="123/0.jpg", + width=10, + height=10, + bytes_size=100, + downloaded_at=datetime(2026, 5, 24, tzinfo=timezone.utc), + is_decorative=False, + description="hello", + description_lang="English", + description_version="v1", + described_at=datetime(2026, 5, 24), # naive — must fail + ) + + +def test_media_photo_described_rejects_unsupported_description_lang(): + """`description_lang` must be in `SUPPORTED_LANGUAGES`; others are rejected. + + The type alias is derived from `i18n.SUPPORTED_LANGUAGES` so the + `Literal[...]` validator rejects unknown languages at construction. + Prevents an out-of-band language tag from polluting the vault. + """ + import pytest + from pydantic import ValidationError + + from xbrain.models import MediaPhotoDescribed + + with pytest.raises(ValidationError): + MediaPhotoDescribed( + url="https://pbs.twimg.com/media/X.jpg", + local_path="123/0.jpg", + width=10, + height=10, + bytes_size=100, + downloaded_at=datetime(2026, 5, 24, tzinfo=timezone.utc), + is_decorative=False, + description="hello", + description_lang="Klingon", # not in SUPPORTED_LANGUAGES + description_version="v1", + described_at=datetime(2026, 5, 24, tzinfo=timezone.utc), + ) + + +def test_media_photo_described_rejects_decorative_with_nonempty_description(): + """`is_decorative=True` implies `description == ""` — model-validator enforces. + + Defence-in-depth for hand-edited records: the producer + (`describe._apply_judgment`) already blanks the description on + decorative judgments, but a hand-written entry that violates the + invariant must still be rejected at the type boundary so downstream + callers can rely on `is_decorative => not description` unconditionally. + """ + import pytest + from pydantic import ValidationError + + from xbrain.models import MediaPhotoDescribed + + with pytest.raises(ValidationError): + MediaPhotoDescribed( + url="https://pbs.twimg.com/media/X.jpg", + local_path="123/0.jpg", + width=10, + height=10, + bytes_size=100, + downloaded_at=datetime(2026, 5, 24, tzinfo=timezone.utc), + is_decorative=True, + description="should be empty when decorative", # violates invariant + description_lang="English", + description_version="v1", + described_at=datetime(2026, 5, 24, tzinfo=timezone.utc), + ) + + def test_media_discriminator_rejects_unknown_kind(): """Silently inventing a variant would mask data corruption — reject loudly.""" import pytest