From b6ec1a5232fd1b695b1ee15b941c4bdb1a177e4d Mon Sep 17 00:00:00 2001 From: Aaron Bockelie Date: Mon, 1 Jun 2026 18:48:28 -0500 Subject: [PATCH 1/2] feat(adr-102): add concept embedding-dimension check to offline validator MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The offline validator checked that profile *references* resolve (index in range, cascade resolves, identity string shape) but never verified a concept's actual embedding vector length against its resolved profile's @dims. A backup could mis-tag a 768-dim concept as @1536 and pass clean — then mis-attach or fail at restore (esp. integration mode, which keys on embedding-space identity). Adds E_CONCEPT_EMBEDDING_DIM: for each concept carrying an embedding vector whose profile resolves with a parseable @dims, assert len(embedding) == dims (spec §3.2). Only fires when the vector is present and the profile resolves cleanly (bad index / malformed identity are flagged separately), so it never double-reports. - New _profile_dims() helper parses the @dims suffix from the resolved profile identity. - _validate_bulk now binds the profiles list itself (was n_profiles only) to look up the identity. - selftest gains a pass case (3-dim vector under @3) and a fail case (2 != @3). Test fixtures fixed: test_id_remap / test_kg_backup_v2 / test_restore_modes declared @1536/@768 profiles but carried 1-2 element placeholder vectors — never dimension-honest. Switched them to synthetic identities (test:embed@1 / @2) matching their vector lengths. Real backups (e.g. a full 3994-concept nomic@768 export) validate clean. NOTE: lint_backup.py is now 830 lines (>800). A package split is contraindicated — it's loaded by pytest as a single standalone file via importlib spec_from_file_location (the Track-D no-deps oracle); splitting would break that. Flagging for a future task. 67 passed across the kg-backup/restore slice; selftest PASS. 
--- scripts/development/lint/lint_backup.py | 72 +++++++++++++++++++++++-- tests/unit/test_id_remap.py | 4 +- tests/unit/test_kg_backup_v2.py | 4 +- tests/unit/test_restore_modes.py | 6 +-- 4 files changed, 76 insertions(+), 10 deletions(-) diff --git a/scripts/development/lint/lint_backup.py b/scripts/development/lint/lint_backup.py index 042518ade..4da85de5a 100644 --- a/scripts/development/lint/lint_backup.py +++ b/scripts/development/lint/lint_backup.py @@ -111,6 +111,8 @@ "E_EPOCH_ACTOR_RANGE": "graph_epoch.actor index out of range.", # cascading default "E_NO_PROFILE_CASCADE": "concept resolves to no embedding-profile (cascade failed).", + # embedding vector integrity + "E_CONCEPT_EMBEDDING_DIM": "concept embedding length != resolved profile @dims.", # referential integrity "E_REL_FROM_MISSING": "relationship.from concept_id not in concepts[].", "E_REL_TO_MISSING": "relationship.to concept_id not in concepts[].", @@ -204,6 +206,28 @@ def _is_int_index(value: Any) -> bool: return isinstance(value, int) and not isinstance(value, bool) +def _profile_dims(profiles: List[Any], idx: Any) -> Optional[int]: + """Parse the ``@dims`` suffix of profile ``idx``'s identity (§3.2). + + Returns the integer dimension count from a ``{provider}:{model}@{dims}`` + identity, or ``None`` when ``idx`` is not a valid in-range profile or the + identity has no parseable ``@dims`` (those shapes are flagged separately by + ``E_CONCEPT_PROFILE_RANGE`` / ``E_PROFILE_IDENTITY``, so this just declines). 
+ + @verified 32c0baea + """ + if not _is_int_index(idx) or not (0 <= idx < len(profiles)): + return None + prof = profiles[idx] + identity = prof.get("identity") if isinstance(prof, dict) else None + if not isinstance(identity, str) or "@" not in identity: + return None + try: + return int(identity.rsplit("@", 1)[1]) + except (ValueError, IndexError): + return None + + def validate_backup(obj: Dict[str, Any]) -> ValidationResult: """Validate a kg-backup object offline and return structured findings. @@ -364,7 +388,10 @@ def _validate_bulk( ) -> None: """Validate the bulk region: indices, integrity, epochs, dups, exclusions (§4-6). - @verified cffa180b + Includes the concept embedding-dimension check (§3.2): a concept's vector + length must equal its resolved profile's ``@dims``. + + @verified 32c0baea """ bulk = obj.get("bulk") if not isinstance(bulk, dict): @@ -385,7 +412,8 @@ def _validate_bulk( relationships = bulk.get("relationships") or [] graph_epochs = bulk.get("graph_epochs") # None => simple mode - n_profiles = len(header.get("embedding_profiles") or []) + profiles = header.get("embedding_profiles") or [] + n_profiles = len(profiles) n_rel_types = len(header.get("relationship_vocabulary") or []) n_content_types = len(header.get("content_types") or []) n_kinds = len(header.get("epoch_kinds") or []) @@ -423,6 +451,19 @@ def _validate_bulk( f"record→ontology→backup cascade (spec §4.1).", f"$.bulk.concepts[{i}]") + # embedding vector dimension must honor the resolved profile's @dims (§3.2). + # Only checked when a vector is present AND the profile resolves with parseable + # dims — bad index / malformed identity are already flagged above. 
+ embedding = c.get("embedding") + if isinstance(embedding, list): + dims = _profile_dims(profiles, resolved) + if dims is not None and len(embedding) != dims: + ident = profiles[resolved].get("identity") + result.add(ERROR, "E_CONCEPT_EMBEDDING_DIM", + f"concept {cid!r} embedding has {len(embedding)} dims but " + f"resolved profile {resolved} ({ident!r}) declares {dims}.", + f"$.bulk.concepts[{i}].embedding") + # faithful mode: concepts should carry epoch stamps if faithful: if c.get("created_at_epoch") is None or c.get("last_seen_epoch") is None: @@ -624,7 +665,7 @@ def print_report(result: ValidationResult) -> None: def _selftest() -> int: """Run a minimal valid/invalid self-test. Returns process exit code. - @verified cffa180b + @verified 32c0baea """ valid = { "header": { @@ -696,6 +737,31 @@ def _selftest() -> int: missing = expected - codes assert not missing, f"invalid sample missing expected codes: {missing}" + # embedding dimension consistency (E_CONCEPT_EMBEDDING_DIM): a concept's vector + # length must equal its resolved profile's @dims. Uses a tiny @3 profile. 
+ def _dim_obj(embedding): + return { + "header": { + "format_version": "kg-backup/2", + "source": {}, "exported_at": "x", "schema_version": 76, + "embedding_profiles": [{"identity": "test:model@3"}], + "default_embedding_profile": 0, + "relationship_vocabulary": [], "epoch_kinds": [], "actors": [], + "content_types": [], "ontologies": [], + }, + "bulk": { + "concepts": [{"concept_id": "c1", "embedding": embedding}], + "sources": [], "instances": [], "evidence": [], + "relationships": [], "vocabulary": [], + }, + } + r_dim_ok = validate_backup(_dim_obj([0.1, 0.2, 0.3])) # 3 == @3 + assert not any(i.code == "E_CONCEPT_EMBEDDING_DIM" for i in r_dim_ok.issues), \ + "matching embedding dims must not flag E_CONCEPT_EMBEDDING_DIM" + r_dim_bad = validate_backup(_dim_obj([0.1, 0.2])) # 2 != @3 + assert any(i.code == "E_CONCEPT_EMBEDDING_DIM" for i in r_dim_bad.issues), \ + "embedding length != profile @dims must flag E_CONCEPT_EMBEDDING_DIM" + # single-path: the removed legacy flat shape (no header) is refused legacy = {"version": "1.0", "type": "full", "data": {"concepts": [], "sources": [], "instances": [], "relationships": []}} diff --git a/tests/unit/test_id_remap.py b/tests/unit/test_id_remap.py index d40f10442..ce486c014 100644 --- a/tests/unit/test_id_remap.py +++ b/tests/unit/test_id_remap.py @@ -41,8 +41,8 @@ def _backup(**overrides): {"from": "c1", "to": "c2", "type": "IMPLIES", "properties": {"learned_id": "s1"}}, ], vocabulary=[{"relationship_type": "IMPLIES", "description": "", "category": "logical", - "embedding_model": "openai:text-embedding-3-small@1536"}], - embedding_profiles=[{"identity": "openai:text-embedding-3-small@1536", + "embedding_model": "test:embed@1"}], + embedding_profiles=[{"identity": "test:embed@1", "vector_space": "x", "image_vector_space": None, "name": "d", "multimodal": False}], epoch_kinds=[{"kind": "ingestion", "semantic_wallclock": True, "description": ""}], diff --git a/tests/unit/test_kg_backup_v2.py 
b/tests/unit/test_kg_backup_v2.py index b86a08a3f..d1fe0bdea 100644 --- a/tests/unit/test_kg_backup_v2.py +++ b/tests/unit/test_kg_backup_v2.py @@ -45,10 +45,10 @@ def _fixture_lists(): ], vocabulary=[ {"relationship_type": "IMPLIES", "description": "x", "category": "logical", - "embedding_model": "openai:text-embedding-3-small@1536"}, + "embedding_model": "test:embed@2"}, ], embedding_profiles=[ - {"identity": "openai:text-embedding-3-small@1536", + {"identity": "test:embed@2", "vector_space": "openai-3-small", "image_vector_space": None, "name": "default", "multimodal": False}, ], diff --git a/tests/unit/test_restore_modes.py b/tests/unit/test_restore_modes.py index d732c1b1e..52acb52b7 100644 --- a/tests/unit/test_restore_modes.py +++ b/tests/unit/test_restore_modes.py @@ -36,8 +36,8 @@ def _backup(): evidence=[{"concept_id": "c1", "instance_id": "i1"}, {"concept_id": "c2", "instance_id": "i1"}], relationships=[{"from": "c1", "to": "c2", "type": "IMPLIES", "properties": {"learned_id": "s1"}}], vocabulary=[{"relationship_type": "IMPLIES", "description": "", "category": "logical", - "embedding_model": "openai:text-embedding-3-small@1536"}], - embedding_profiles=[{"identity": "openai:text-embedding-3-small@1536", "vector_space": "x", + "embedding_model": "test:embed@2"}], + embedding_profiles=[{"identity": "test:embed@2", "vector_space": "x", "image_vector_space": None, "name": "d", "multimodal": False}], epoch_kinds=[{"kind": "ingestion", "semantic_wallclock": True, "description": ""}], graph_epochs=[{"event_id": 1, "occurred_at": "2026-06-01T00:00:00Z", "kind": "ingestion", @@ -46,7 +46,7 @@ def _backup(): ) -_BACKUP_IDENTITY = "openai:text-embedding-3-small@1536" +_BACKUP_IDENTITY = "test:embed@2" class _FakeMatcher: From 3634d2fdb633bfe213a2c5fdd4f5df0b5127f53c Mon Sep 17 00:00:00 2001 From: Aaron Bockelie Date: Mon, 1 Jun 2026 18:52:45 -0500 Subject: [PATCH 2/2] test(adr-102): selftest the no-double-report invariant for dim check Add a selftest case (review 
nit): an out-of-range record-level embedding_profile with a present embedding must yield E_CONCEPT_PROFILE_RANGE only, NOT also E_CONCEPT_EMBEDDING_DIM. Guards the decline-when-unresolved behavior against a future refactor that might move the dim check before the range/identity guards. --- scripts/development/lint/lint_backup.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/scripts/development/lint/lint_backup.py b/scripts/development/lint/lint_backup.py index 4da85de5a..f84262457 100644 --- a/scripts/development/lint/lint_backup.py +++ b/scripts/development/lint/lint_backup.py @@ -761,6 +761,15 @@ def _dim_obj(embedding): r_dim_bad = validate_backup(_dim_obj([0.1, 0.2])) # 2 != @3 assert any(i.code == "E_CONCEPT_EMBEDDING_DIM" for i in r_dim_bad.issues), \ "embedding length != profile @dims must flag E_CONCEPT_EMBEDDING_DIM" + # no double-report: an out-of-range profile + a present embedding yields the + # range error only — the dim check declines when the profile doesn't resolve. + obj_oor = _dim_obj([0.1, 0.2, 0.3]) + obj_oor["bulk"]["concepts"][0]["embedding_profile"] = 99 + r_oor = validate_backup(obj_oor) + assert any(i.code == "E_CONCEPT_PROFILE_RANGE" for i in r_oor.issues), \ + "out-of-range record profile must flag E_CONCEPT_PROFILE_RANGE" + assert not any(i.code == "E_CONCEPT_EMBEDDING_DIM" for i in r_oor.issues), \ + "out-of-range profile must NOT also trigger E_CONCEPT_EMBEDDING_DIM" # single-path: the removed legacy flat shape (no header) is refused legacy = {"version": "1.0", "type": "full", "data": {"concepts": [], "sources": [],