diff --git a/scripts/development/lint/lint_backup.py b/scripts/development/lint/lint_backup.py index 042518ade..f84262457 100644 --- a/scripts/development/lint/lint_backup.py +++ b/scripts/development/lint/lint_backup.py @@ -111,6 +111,8 @@ "E_EPOCH_ACTOR_RANGE": "graph_epoch.actor index out of range.", # cascading default "E_NO_PROFILE_CASCADE": "concept resolves to no embedding-profile (cascade failed).", + # embedding vector integrity + "E_CONCEPT_EMBEDDING_DIM": "concept embedding length != resolved profile @dims.", # referential integrity "E_REL_FROM_MISSING": "relationship.from concept_id not in concepts[].", "E_REL_TO_MISSING": "relationship.to concept_id not in concepts[].", @@ -204,6 +206,28 @@ def _is_int_index(value: Any) -> bool: return isinstance(value, int) and not isinstance(value, bool) +def _profile_dims(profiles: List[Any], idx: Any) -> Optional[int]: + """Parse the ``@dims`` suffix of profile ``idx``'s identity (§3.2). + + Returns the integer dimension count from a ``{provider}:{model}@{dims}`` + identity, or ``None`` when ``idx`` is not a valid in-range profile or the + identity has no parseable ``@dims`` (those shapes are flagged separately by + ``E_CONCEPT_PROFILE_RANGE`` / ``E_PROFILE_IDENTITY``, so this just declines). + + @verified 32c0baea + """ + if not _is_int_index(idx) or not (0 <= idx < len(profiles)): + return None + prof = profiles[idx] + identity = prof.get("identity") if isinstance(prof, dict) else None + if not isinstance(identity, str) or "@" not in identity: + return None + try: + return int(identity.rsplit("@", 1)[1]) + except (ValueError, IndexError): + return None + + def validate_backup(obj: Dict[str, Any]) -> ValidationResult: """Validate a kg-backup object offline and return structured findings. @@ -364,7 +388,10 @@ def _validate_bulk( ) -> None: """Validate the bulk region: indices, integrity, epochs, dups, exclusions (§4-6). - @verified cffa180b + Includes the concept embedding-dimension check (§3.2): a concept's vector + length must equal its resolved profile's ``@dims``. + + @verified 32c0baea """ bulk = obj.get("bulk") if not isinstance(bulk, dict): @@ -385,7 +412,8 @@ def _validate_bulk( relationships = bulk.get("relationships") or [] graph_epochs = bulk.get("graph_epochs") # None => simple mode - n_profiles = len(header.get("embedding_profiles") or []) + profiles = header.get("embedding_profiles") or [] + n_profiles = len(profiles) n_rel_types = len(header.get("relationship_vocabulary") or []) n_content_types = len(header.get("content_types") or []) n_kinds = len(header.get("epoch_kinds") or []) @@ -423,6 +451,19 @@ def _validate_bulk( f"record→ontology→backup cascade (spec §4.1).", f"$.bulk.concepts[{i}]") + # embedding vector dimension must honor the resolved profile's @dims (§3.2). + # Only checked when a vector is present AND the profile resolves with parseable + # dims — bad index / malformed identity are already flagged above. + embedding = c.get("embedding") + if isinstance(embedding, list): + dims = _profile_dims(profiles, resolved) + if dims is not None and len(embedding) != dims: + ident = profiles[resolved].get("identity") + result.add(ERROR, "E_CONCEPT_EMBEDDING_DIM", + f"concept {cid!r} embedding has {len(embedding)} dims but " + f"resolved profile {resolved} ({ident!r}) declares {dims}.", + f"$.bulk.concepts[{i}].embedding") + # faithful mode: concepts should carry epoch stamps if faithful: if c.get("created_at_epoch") is None or c.get("last_seen_epoch") is None: @@ -624,7 +665,7 @@ def print_report(result: ValidationResult) -> None: def _selftest() -> int: """Run a minimal valid/invalid self-test. Returns process exit code. - @verified cffa180b + @verified 32c0baea """ valid = { "header": { @@ -696,6 +737,40 @@ def _selftest() -> int: missing = expected - codes assert not missing, f"invalid sample missing expected codes: {missing}" + # embedding dimension consistency (E_CONCEPT_EMBEDDING_DIM): a concept's vector + # length must equal its resolved profile's @dims. Uses a tiny @3 profile. + def _dim_obj(embedding): + return { + "header": { + "format_version": "kg-backup/2", + "source": {}, "exported_at": "x", "schema_version": 76, + "embedding_profiles": [{"identity": "test:model@3"}], + "default_embedding_profile": 0, + "relationship_vocabulary": [], "epoch_kinds": [], "actors": [], + "content_types": [], "ontologies": [], + }, + "bulk": { + "concepts": [{"concept_id": "c1", "embedding": embedding}], + "sources": [], "instances": [], "evidence": [], + "relationships": [], "vocabulary": [], + }, + } + r_dim_ok = validate_backup(_dim_obj([0.1, 0.2, 0.3])) # 3 == @3 + assert not any(i.code == "E_CONCEPT_EMBEDDING_DIM" for i in r_dim_ok.issues), \ + "matching embedding dims must not flag E_CONCEPT_EMBEDDING_DIM" + r_dim_bad = validate_backup(_dim_obj([0.1, 0.2])) # 2 != @3 + assert any(i.code == "E_CONCEPT_EMBEDDING_DIM" for i in r_dim_bad.issues), \ + "embedding length != profile @dims must flag E_CONCEPT_EMBEDDING_DIM" + # no double-report: an out-of-range profile + a present embedding yields the + # range error only — the dim check declines when the profile doesn't resolve. + obj_oor = _dim_obj([0.1, 0.2, 0.3]) + obj_oor["bulk"]["concepts"][0]["embedding_profile"] = 99 + r_oor = validate_backup(obj_oor) + assert any(i.code == "E_CONCEPT_PROFILE_RANGE" for i in r_oor.issues), \ + "out-of-range record profile must flag E_CONCEPT_PROFILE_RANGE" + assert not any(i.code == "E_CONCEPT_EMBEDDING_DIM" for i in r_oor.issues), \ + "out-of-range profile must NOT also trigger E_CONCEPT_EMBEDDING_DIM" + # single-path: the removed legacy flat shape (no header) is refused legacy = {"version": "1.0", "type": "full", "data": {"concepts": [], "sources": [], "instances": [], "relationships": []}} diff --git a/tests/unit/test_id_remap.py b/tests/unit/test_id_remap.py index d40f10442..ce486c014 100644 --- a/tests/unit/test_id_remap.py +++ b/tests/unit/test_id_remap.py @@ -41,8 +41,8 @@ def _backup(**overrides): {"from": "c1", "to": "c2", "type": "IMPLIES", "properties": {"learned_id": "s1"}}, ], vocabulary=[{"relationship_type": "IMPLIES", "description": "", "category": "logical", - "embedding_model": "openai:text-embedding-3-small@1536"}], - embedding_profiles=[{"identity": "openai:text-embedding-3-small@1536", + "embedding_model": "test:embed@1"}], + embedding_profiles=[{"identity": "test:embed@1", "vector_space": "x", "image_vector_space": None, "name": "d", "multimodal": False}], epoch_kinds=[{"kind": "ingestion", "semantic_wallclock": True, "description": ""}], diff --git a/tests/unit/test_kg_backup_v2.py b/tests/unit/test_kg_backup_v2.py index b86a08a3f..d1fe0bdea 100644 --- a/tests/unit/test_kg_backup_v2.py +++ b/tests/unit/test_kg_backup_v2.py @@ -45,10 +45,10 @@ def _fixture_lists(): ], vocabulary=[ {"relationship_type": "IMPLIES", "description": "x", "category": "logical", - "embedding_model": "openai:text-embedding-3-small@1536"}, + "embedding_model": "test:embed@2"}, ], embedding_profiles=[ - {"identity": "openai:text-embedding-3-small@1536", + {"identity": "test:embed@2", "vector_space": "openai-3-small", "image_vector_space": None, "name": "default", "multimodal": False}, ], diff --git a/tests/unit/test_restore_modes.py b/tests/unit/test_restore_modes.py index d732c1b1e..52acb52b7 100644 --- a/tests/unit/test_restore_modes.py +++ b/tests/unit/test_restore_modes.py @@ -36,8 +36,8 @@ def _backup(): evidence=[{"concept_id": "c1", "instance_id": "i1"}, {"concept_id": "c2", "instance_id": "i1"}], relationships=[{"from": "c1", "to": "c2", "type": "IMPLIES", "properties": {"learned_id": "s1"}}], vocabulary=[{"relationship_type": "IMPLIES", "description": "", "category": "logical", - "embedding_model": "openai:text-embedding-3-small@1536"}], - embedding_profiles=[{"identity": "openai:text-embedding-3-small@1536", "vector_space": "x", + "embedding_model": "test:embed@2"}], + embedding_profiles=[{"identity": "test:embed@2", "vector_space": "x", "image_vector_space": None, "name": "d", "multimodal": False}], epoch_kinds=[{"kind": "ingestion", "semantic_wallclock": True, "description": ""}], graph_epochs=[{"event_id": 1, "occurred_at": "2026-06-01T00:00:00Z", "kind": "ingestion", @@ -46,7 +46,7 @@ def _backup(): ) -_BACKUP_IDENTITY = "openai:text-embedding-3-small@1536" +_BACKUP_IDENTITY = "test:embed@2" class _FakeMatcher: