From 13b28f9f94da196fc0c60e81415b77dafd4d1c11 Mon Sep 17 00:00:00 2001 From: Nelson Spence Date: Fri, 29 May 2026 01:14:46 -0500 Subject: [PATCH 1/6] add manifest verifier Signed-off-by: Nelson Spence --- .github/dependabot.yml | 6 +- .github/workflows/ci.yml | 27 + CHANGELOG.md | 10 + Cargo.lock | 638 ++++++++++++++++- Cargo.toml | 11 +- README.md | 12 +- THREAT_MODEL.md | 19 +- docs/INDEX_PROVENANCE.md | 36 +- ordvec-manifest/Cargo.toml | 35 + ordvec-manifest/README.md | 22 + ordvec-manifest/src/lib.rs | 1105 +++++++++++++++++++++++++++++ ordvec-manifest/src/main.rs | 312 ++++++++ ordvec-manifest/src/sqlite.rs | 133 ++++ ordvec-manifest/tests/manifest.rs | 458 ++++++++++++ src/lib.rs | 1 + src/rank_io.rs | 397 ++++++++++- 16 files changed, 3193 insertions(+), 29 deletions(-) create mode 100644 ordvec-manifest/Cargo.toml create mode 100644 ordvec-manifest/README.md create mode 100644 ordvec-manifest/src/lib.rs create mode 100644 ordvec-manifest/src/main.rs create mode 100644 ordvec-manifest/src/sqlite.rs create mode 100644 ordvec-manifest/tests/manifest.rs diff --git a/.github/dependabot.yml b/.github/dependabot.yml index ee6bf5e..2aa6b8e 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -19,9 +19,9 @@ updates: patterns: - "*" - # Cargo workspace root: core `ordvec` + the `ordvec-python` binding member - # (single workspace Cargo.lock). Group minor+patch into one PR; majors stay - # separate for manual review. + # Cargo workspace root: core `ordvec`, C ABI, Python binding, and repo-local + # manifest verifier (single workspace Cargo.lock). Group minor+patch into one + # PR; majors stay separate for manual review. 
- package-ecosystem: "cargo" directory: "/" schedule: diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index bc78c65..50b25ec 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -94,6 +94,33 @@ jobs: - name: cargo clippy -p ordvec-ffi run: cargo clippy -p ordvec-ffi --all-targets -- -D warnings + # ---------------------------------------------------------------------- + # Repo-local manifest verifier. Kept out of default-members so the core + # crate's default no-system-deps contract stays unchanged; this explicit lane + # covers its no-default-features surface, optional SQLite support, and clippy. + # ---------------------------------------------------------------------- + manifest: + name: manifest verifier + runs-on: ubuntu-latest + steps: + - uses: step-security/harden-runner@9af89fc71515a100421586dfdb3dc9c984fbf411 # v2.19.4 + with: + egress-policy: audit + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false + - uses: dtolnay/rust-toolchain@29eef336d9b2848a0b548edc03f92a220660cdb8 # stable (2026-03-27) + with: + toolchain: stable + components: clippy + - uses: Swatinem/rust-cache@e18b497796c12c097a38f9edb9d0641fb99eee32 # v2 + - name: cargo test -p ordvec-manifest --no-default-features + run: cargo test -p ordvec-manifest --no-default-features + - name: cargo test -p ordvec-manifest --all-features + run: cargo test -p ordvec-manifest --all-features + - name: cargo clippy -p ordvec-manifest + run: cargo clippy -p ordvec-manifest --all-targets --all-features -- -D warnings + # ---------------------------------------------------------------------- # Repo-local Go wrapper over the C ABI. Build the release static library # first because the cgo wrapper links the source-tree archive directly. 
diff --git a/CHANGELOG.md b/CHANGELOG.md index 6dec955..9845b31 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,11 +7,21 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Added + +- Added `probe_index_metadata` to inspect persisted `Rank`, `RankQuant`, + `Bitmap`, and `SignBitmap` headers without allocating payloads. +- Added the repo-local, publish=false `ordvec-manifest` crate with a strict v1 + JSON schema, artifact and row-identity verification, attestation shape + checks, a CLI, and optional SQLite registry/cache support. + ### Documentation - Reframed bitmap-overlap docs around the checked Lean proof spine: query symmetry, quotient sufficiency, finite threshold optimality, and idealized hypergeometric calibration, while preserving the real-encoder caveats. +- Documented sidecar manifest verification as a pre-load provenance check that + does not sign, manage keys, call networks, or decide trust policy. ## [0.2.0] - 2026-05-26 diff --git a/Cargo.lock b/Cargo.lock index 59e6f60..15bfc0e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,65 @@ # It is not intended for manual editing. 
version = 4 +[[package]] +name = "android_system_properties" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311" +dependencies = [ + "libc", +] + +[[package]] +name = "anstream" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "824a212faf96e9acacdbd09febd34438f8f711fb84e09a8916013cd7815ca28d" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is_terminal_polyfill", + "utf8parse", +] + +[[package]] +name = "anstyle" +version = "1.0.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "940b3a0ca603d1eade50a4846a2afffd5ef57a9feac2c0e2ec2e14f9ead76000" + +[[package]] +name = "anstyle-parse" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52ce7f38b242319f7cabaa6813055467063ecdc9d355bbb4ce0c68908cd8130e" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-query" +version = "1.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc" +dependencies = [ + "windows-sys", +] + +[[package]] +name = "anstyle-wincon" +version = "3.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d" +dependencies = [ + "anstyle", + "once_cell_polyfill", + "windows-sys", +] + [[package]] name = "anyhow" version = "1.0.102" @@ -20,6 +79,31 @@ version = "2.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c4512299f36f043ab09a583e57bceb5a5aab7a73db1805848e8fef3c9e8c78b3" +[[package]] +name = "block-buffer" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" 
+dependencies = [ + "generic-array", +] + +[[package]] +name = "bumpalo" +version = "3.20.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72f5acc6cb2ba439de613abc23857ec3d78374d8ed5ac84e9d11336e87da8649" + +[[package]] +name = "cc" +version = "1.2.62" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1dce859f0832a7d088c4f1119888ab94ef4b5d6795d1ce05afb7fe159d79f98" +dependencies = [ + "find-msvc-tools", + "shlex", +] + [[package]] name = "cfg-if" version = "1.0.4" @@ -33,10 +117,82 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6f8d983286843e49675a4b7a2d174efe136dc93a18d69130dd18198a6c167601" dependencies = [ "cfg-if", - "cpufeatures", + "cpufeatures 0.3.0", "rand_core", ] +[[package]] +name = "chrono" +version = "0.4.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c673075a2e0e5f4a1dde27ce9dee1ea4558c7ffe648f576438a20ca1d2acc4b0" +dependencies = [ + "iana-time-zone", + "num-traits", + "windows-link", +] + +[[package]] +name = "clap" +version = "4.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ddb117e43bbf7dacf0a4190fef4d345b9bad68dfc649cb349e7d17d28428e51" +dependencies = [ + "clap_builder", + "clap_derive", +] + +[[package]] +name = "clap_builder" +version = "4.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "714a53001bf66416adb0e2ef5ac857140e7dc3a0c48fb28b2f10762fc4b5069f" +dependencies = [ + "anstream", + "anstyle", + "clap_lex", + "strsim", +] + +[[package]] +name = "clap_derive" +version = "4.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2ce8604710f6733aa641a2b3731eaa1e8b3d9973d5e3565da11800813f997a9" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "clap_lex" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"c8d4a3bb8b1e0c1050499d1815f5ab16d04f0959b233085fb31653fbfc9d98f9" + +[[package]] +name = "colorchoice" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d07550c9036bf2ae0c684c4297d503f838287c83c53686d05370d0e139ae570" + +[[package]] +name = "core-foundation-sys" +version = "0.8.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" + +[[package]] +name = "cpufeatures" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280" +dependencies = [ + "libc", +] + [[package]] name = "cpufeatures" version = "0.3.0" @@ -71,6 +227,26 @@ version = "0.8.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" +[[package]] +name = "crypto-common" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78c8292055d1c1df0cce5d180393dc8cce0abec0a7102adb6c7b1eef6016d60a" +dependencies = [ + "generic-array", + "typenum", +] + +[[package]] +name = "digest" +version = "0.10.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" +dependencies = [ + "block-buffer", + "crypto-common", +] + [[package]] name = "either" version = "1.16.0" @@ -83,12 +259,86 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" +[[package]] +name = "errno" +version = "0.3.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" +dependencies = [ + "libc", + "windows-sys", +] + +[[package]] +name = "fallible-iterator" +version = "0.3.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "2acce4a10f12dc2fb14a218589d4f1f62ef011b2d0cc4b3cb1bba8e94da14649" + +[[package]] +name = "fallible-streaming-iterator" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a" + +[[package]] +name = "fastrand" +version = "2.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f1f227452a390804cdb637b74a86990f2a7d7ba4b7d5693aac9b4dd6defd8d6" + +[[package]] +name = "find-msvc-tools" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582" + [[package]] name = "foldhash" version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" +[[package]] +name = "foldhash" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb" + +[[package]] +name = "futures-core" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e3450815272ef58cec6d564423f6e755e25379b217b0bc688e295ba24df6b1d" + +[[package]] +name = "futures-task" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "037711b3d59c33004d3856fbdc83b99d4ff37a24768fa1be9ce3538a1cde4393" + +[[package]] +name = "futures-util" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "389ca41296e6190b48053de0321d02a77f32f8a5d2461dd38762c0593805c6d6" +dependencies = [ + "futures-core", + "futures-task", + "pin-project-lite", + "slab", +] + +[[package]] +name = "generic-array" +version = "0.14.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" +dependencies = [ + "typenum", + "version_check", +] + [[package]] name = "getrandom" version = "0.4.2" @@ -109,7 +359,16 @@ version = "0.15.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" dependencies = [ - "foldhash", + "foldhash 0.1.5", +] + +[[package]] +name = "hashbrown" +version = "0.16.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" +dependencies = [ + "foldhash 0.2.0", ] [[package]] @@ -118,12 +377,51 @@ version = "0.17.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ed5909b6e89a2db4456e54cd5f673791d7eca6732202bbf2a9cc504fe2f9b84a" +[[package]] +name = "hashlink" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea0b22561a9c04a7cb1a302c013e0259cd3b4bb619f145b32f72b8b4bcbed230" +dependencies = [ + "hashbrown 0.16.1", +] + [[package]] name = "heck" version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" +[[package]] +name = "hex" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" + +[[package]] +name = "iana-time-zone" +version = "0.1.65" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e31bc9ad994ba00e440a8aa5c9ef0ec67d5cb5e5cb0cc7f8b744a35b389cc470" +dependencies = [ + "android_system_properties", + "core-foundation-sys", + "iana-time-zone-haiku", + "js-sys", + "log", + "wasm-bindgen", + "windows-core", +] + +[[package]] +name = "iana-time-zone-haiku" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f" +dependencies = [ + "cc", +] + [[package]] name = "id-arena" version = "2.3.0" @@ -151,12 +449,30 @@ dependencies = [ "rustversion", ] +[[package]] +name = "is_terminal_polyfill" +version = "1.70.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" + [[package]] name = "itoa" version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682" +[[package]] +name = "js-sys" +version = "0.3.99" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "142bc4740e452c1e57ade0cbc129f139c9093e354346f0872ef985f4f5cf5f11" +dependencies = [ + "cfg-if", + "futures-util", + "once_cell", + "wasm-bindgen", +] + [[package]] name = "leb128fmt" version = "0.1.0" @@ -169,6 +485,23 @@ version = "0.2.186" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66" +[[package]] +name = "libsqlite3-sys" +version = "0.37.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1f111c8c41e7c61a49cd34e44c7619462967221a6443b0ec299e0ac30cfb9b1" +dependencies = [ + "cc", + "pkg-config", + "vcpkg", +] + +[[package]] +name = "linux-raw-sys" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53" + [[package]] name = "log" version = "0.4.30" @@ -264,6 +597,12 @@ version = "1.21.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50" +[[package]] +name = "once_cell_polyfill" +version = "1.70.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" + [[package]] name = "ordvec" version = "0.2.0" @@ -280,6 +619,22 @@ dependencies = [ "ordvec", ] +[[package]] +name = "ordvec-manifest" +version = "0.0.0" +dependencies = [ + "chrono", + "clap", + "hex", + "ordvec", + "rusqlite", + "serde", + "serde_json", + "sha2", + "tempfile", + "uuid", +] + [[package]] name = "ordvec-python" version = "0.2.0" @@ -289,6 +644,18 @@ dependencies = [ "pyo3", ] +[[package]] +name = "pin-project-lite" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a89322df9ebe1c1578d689c92318e070967d1042b512afbe49518723f4e6d5cd" + +[[package]] +name = "pkg-config" +version = "0.3.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19f132c84eca552bf34cab8ec81f1c1dcc229b811638f9d283dceabe58c5569e" + [[package]] name = "portable-atomic" version = "1.13.1" @@ -461,12 +828,50 @@ dependencies = [ "crossbeam-utils", ] +[[package]] +name = "rsqlite-vfs" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c51c9ae4df8a7fba42103df5c621fa3c37eccf3a3c650879e90fc48b11cc192c" +dependencies = [ + "hashbrown 0.16.1", + "thiserror", +] + +[[package]] +name = "rusqlite" +version = "0.39.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a0d2b0146dd9661bf67bb107c0bb2a55064d556eeb3fc314151b957f313bcd4e" +dependencies = [ + "bitflags", + "fallible-iterator", + "fallible-streaming-iterator", + "hashlink", + "libsqlite3-sys", + "smallvec", + "sqlite-wasm-rs", +] + [[package]] name = "rustc-hash" version = "2.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "94300abf3f1ae2e2b8ffb7b58043de3d399c73fa6f4b73826402a5c457614dbe" +[[package]] +name = "rustix" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190" 
+dependencies = [ + "bitflags", + "errno", + "libc", + "linux-raw-sys", + "windows-sys", +] + [[package]] name = "rustversion" version = "1.0.22" @@ -486,6 +891,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" dependencies = [ "serde_core", + "serde_derive", ] [[package]] @@ -521,6 +927,53 @@ dependencies = [ "zmij", ] +[[package]] +name = "sha2" +version = "0.10.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283" +dependencies = [ + "cfg-if", + "cpufeatures 0.2.17", + "digest", +] + +[[package]] +name = "shlex" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" + +[[package]] +name = "slab" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c790de23124f9ab44544d7ac05d60440adc586479ce501c1d6d7da3cd8c9cf5" + +[[package]] +name = "smallvec" +version = "1.15.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" + +[[package]] +name = "sqlite-wasm-rs" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc3efc0da82635d7e1ced0053bbbfa8c7ab9645d0bf36ceb4f7127bb85315d75" +dependencies = [ + "cc", + "js-sys", + "rsqlite-vfs", + "wasm-bindgen", +] + +[[package]] +name = "strsim" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" + [[package]] name = "syn" version = "2.0.117" @@ -538,6 +991,45 @@ version = "0.13.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "adb6935a6f5c20170eeceb1a3835a49e12e19d792f6dd344ccc76a985ca5a6ca" +[[package]] +name = 
"tempfile" +version = "3.27.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32497e9a4c7b38532efcdebeef879707aa9f794296a4f0244f6f69e9bc8574bd" +dependencies = [ + "fastrand", + "getrandom", + "once_cell", + "rustix", + "windows-sys", +] + +[[package]] +name = "thiserror" +version = "2.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4288b5bcbc7920c07a1149a35cf9590a2aa808e0bc1eafaade0b80947865fbc4" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "2.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "typenum" +version = "1.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40ce102ab67701b8526c123c1bab5cbe42d7040ccfd0f64af1a385808d2f43de" + [[package]] name = "unicode-ident" version = "1.0.24" @@ -556,6 +1048,35 @@ version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7264e107f553ccae879d21fbea1d6724ac785e8c3bfc762137959b5802826ef3" +[[package]] +name = "utf8parse" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" + +[[package]] +name = "uuid" +version = "1.23.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ddd74a9687298c6858e9b88ec8935ec45d22e8fd5e6394fa1bd4e99a87789c76" +dependencies = [ + "getrandom", + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "vcpkg" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" + +[[package]] +name = "version_check" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum 
= "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" + [[package]] name = "wasip2" version = "1.0.3+wasi-0.2.9" @@ -574,6 +1095,51 @@ dependencies = [ "wit-bindgen 0.51.0", ] +[[package]] +name = "wasm-bindgen" +version = "0.2.122" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3ed04576f974d2b2fba0f38c51dbc5518011e38c36bf1143164be765528fd409" +dependencies = [ + "cfg-if", + "once_cell", + "rustversion", + "wasm-bindgen-macro", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.122" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "916151b09da36bd82f6615cbf3a419e2f0ba23a03c6160e8e92eb6bd4aa1dec6" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.122" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "299047362ccbfce148b67ab7e73349f77748e00c8296f9542adfad2ad82c5c5e" +dependencies = [ + "bumpalo", + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.122" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a929b2c61f11ba3e9bc35b50c1f25cb38e0e892c0c231ae2b8cf78d5dad4437" +dependencies = [ + "unicode-ident", +] + [[package]] name = "wasm-encoder" version = "0.244.0" @@ -608,6 +1174,74 @@ dependencies = [ "semver", ] +[[package]] +name = "windows-core" +version = "0.62.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8e83a14d34d0623b51dce9581199302a221863196a1dde71a7663a4c2be9deb" +dependencies = [ + "windows-implement", + "windows-interface", + "windows-link", + "windows-result", + "windows-strings", +] + +[[package]] +name = "windows-implement" +version = "0.60.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "053e2e040ab57b9dc951b72c264860db7eb3b0200ba345b4e4c3b14f67855ddf" 
+dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "windows-interface" +version = "0.59.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f316c4a2570ba26bbec722032c4099d8c8bc095efccdc15688708623367e358" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "windows-link" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" + +[[package]] +name = "windows-result" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7781fa89eaf60850ac3d2da7af8e5242a5ea78d1a11c49bf2910bb5a73853eb5" +dependencies = [ + "windows-link", +] + +[[package]] +name = "windows-strings" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7837d08f69c77cf6b07689544538e017c1bfcf57e34b4c0ff58e6c2cd3b37091" +dependencies = [ + "windows-link", +] + +[[package]] +name = "windows-sys" +version = "0.61.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc" +dependencies = [ + "windows-link", +] + [[package]] name = "wit-bindgen" version = "0.51.0" diff --git a/Cargo.toml b/Cargo.toml index cc393fb..b31d434 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -50,13 +50,14 @@ codegen-units = 1 opt-level = 3 # Workspace: the `ordvec-python` member holds the PyO3/maturin bindings shipped to -# PyPI as `ordvec`. It is `publish = false` and ships separately (never via -# crates.io). `default-members = ["."]` keeps bare `cargo build/test/clippy` scoped -# to the core crate, so the existing CI gates are unaffected; the bindings get their -# own CI job. The single workspace `Cargo.lock` gains pyo3/numpy + transitives. +# PyPI as `ordvec`; `ordvec-ffi` holds the C ABI; and `ordvec-manifest` is a +# repo-local, publish=false sidecar verifier. 
`default-members = ["."]` keeps +# bare `cargo build/test/clippy` scoped to the core crate, so the existing CI +# gates are unaffected; non-core members get explicit CI lanes. The single +# workspace `Cargo.lock` carries their transitive dependencies. [workspace] resolver = "2" -members = ["ordvec-python", "ordvec-ffi"] +members = ["ordvec-python", "ordvec-ffi", "ordvec-manifest"] default-members = ["."] # fuzz/ is a cargo-fuzz crate built only via `cargo +nightly fuzz`. Keep it out of # the workspace so it stays a standalone crate (its own Cargo.lock) and `cargo fuzz` diff --git a/README.md b/README.md index 0f39642..0a7c4d5 100644 --- a/README.md +++ b/README.md @@ -156,6 +156,8 @@ Wheels target CPython 3.10+ (abi3); to build from source instead, see - **Index-file trust model:** [`docs/INDEX_PROVENANCE.md`](docs/INDEX_PROVENANCE.md), [`THREAT_MODEL.md`](THREAT_MODEL.md) +- **Repo-local manifest verifier:** + [`ordvec-manifest/`](ordvec-manifest/) (`cargo run -p ordvec-manifest -- verify --manifest ...`) - **C ABI:** [`docs/c-api.md`](docs/c-api.md) - **Formal proof spine:** [`ordvec-formalization`](https://github.com/Fieldnote-Echo/ordvec-formalization), @@ -214,10 +216,12 @@ The on-disk formats (`.tvr` / `.tvrq` / `.tvbm` / `.tvsb`) carry **no built-in checksum, MAC, or signature — by design.** The loaders validate *structure* (magic, version, bounds, exact-length payload) but not *origin*: a structurally valid file can still be untrusted. If an index file crosses a -trust boundary (network transfer, shared storage), verifying it is the -caller's responsibility — e.g. a SHA-256 manifest, artifact-store integrity, -or Sigstore attestation. No in-format crypto is shipped because it would add -key management the library can't own. See +trust boundary (network transfer, shared storage), verify it before loading. 
+This repo includes a publish=false sidecar CLI, `ordvec-manifest`, that binds an +index file to a JSON manifest by SHA-256, header metadata, row identity, and +attestation shape checks. It does not sign artifacts, manage keys, or decide +deployment trust policy. No in-format crypto is shipped because it would add key +management the library can't own. See [`docs/INDEX_PROVENANCE.md`](docs/INDEX_PROVENANCE.md) and [`THREAT_MODEL.md`](THREAT_MODEL.md). diff --git a/THREAT_MODEL.md b/THREAT_MODEL.md index e5202aa..d5ea7aa 100644 --- a/THREAT_MODEL.md +++ b/THREAT_MODEL.md @@ -1,9 +1,10 @@ # Threat Model — `ordvec` > **Status:** v0.2.0 (pre-1.0), 2026-05-28. This is the maintained threat model -> for the `ordvec` Rust crate, C ABI, Go wrapper, and PyO3/maturin Python bindings. It -> is reviewed when the attack surface changes (new persistence formats, new -> `unsafe` kernels, new FFI surface, or release-pipeline changes). +> for the `ordvec` Rust crate, C ABI, Go wrapper, PyO3/maturin Python bindings, +> and the repo-local `ordvec-manifest` sidecar verifier. It is reviewed when the +> attack surface changes (new persistence formats, new `unsafe` kernels, new +> FFI surface, or release-pipeline changes). 
> > Scope discipline: `ordvec` is a **pure computational library** — no network > surface, no authentication/authorization, no secrets handling, no @@ -66,6 +67,7 @@ absence of a second maintainer is itself a tracked supply-chain residual | Layer | Components | Trust boundary | |---|---|---| | **Deserialization** | `rank_io.rs` — `.tvr` / `.tvrq` / `.tvbm` / `.tvsb` loaders | Untrusted filesystem / network byte stream | +| **Manifest verification** | `ordvec-manifest` — publish=false JSON sidecar verifier | Manifest + index + optional row-map files before load | | **Compute kernels** | `fastscan.rs`, `quant_kernels.rs`, `bitmap.rs`, `sign_bitmap.rs` | Trust established after format validation | | **Index API** | `rank.rs`, `quant.rs`, `bitmap.rs`, `sign_bitmap.rs` | Caller-controlled query embeddings | | **C ABI** | `ordvec-ffi` (`include/ordvec.h`) | C caller ↔ Rust boundary; raw pointers and opaque handles | @@ -142,10 +144,13 @@ problem, not a parser problem. *Mitigation (no format change):* [`docs/INDEX_PROVENANCE.md`](docs/INDEX_PROVENANCE.md) documents that `ordvec` validates structure, not origin, and lists verification options (checksum manifest, artifact-store integrity, Sigstore / GitHub artifact attestation) -for deployments where index files cross trust boundaries. An optional sidecar -verifier (HMAC / BLAKE3) can be added later without a format bump; it is -deliberately **not** shipped now (no concrete deployment requires it, and an -in-format crypto layer would add unowned key management). +for deployments where index files cross trust boundaries. The repo now includes +`ordvec-manifest`, a publish=false sidecar verifier that binds an index file to +JSON manifest metadata by SHA-256, allocation-resistant header probing, strict +row identity checks, and attestation shape checks. 
It deliberately does **not** +sign, manage keys, call networks, mutate index files, change the C ABI, or +decide trust policy; an in-format crypto layer is still not shipped because it +would add unowned key management. --- diff --git a/docs/INDEX_PROVENANCE.md b/docs/INDEX_PROVENANCE.md index ec8d3fc..4dc2c3f 100644 --- a/docs/INDEX_PROVENANCE.md +++ b/docs/INDEX_PROVENANCE.md @@ -40,7 +40,31 @@ The loaders validate **structure, not origin or truth**: If you load index files that were produced elsewhere, transferred over a network, or stored on shared/mutable infrastructure, verify them **before** -loading using whatever your deployment already trusts: +loading. The repo-local `ordvec-manifest` crate provides a sidecar verifier for +that pre-load step: + +```sh +cargo run -p ordvec-manifest -- verify --manifest path/to/index.manifest.json +``` + +The manifest verifier checks: + +- the index bytes against the manifest's SHA-256 digest; +- the fixed index header metadata (`Rank`, `RankQuant`, `Bitmap`, or + `SignBitmap`) without allocating the payload; +- declared dimension, vector count, bytes-per-vector, format version, and + format parameters against the probed metadata; +- row identity, either explicit `row_id_identity` or a strict JSONL row map + whose `row_id` equals the zero-based line number and whose `db_id` is + non-empty, NUL-free, and unique by default; +- attestation **shape** only: predicate type, builder id when present, and at + least one subject SHA-256 matching the artifact when attestations are + supplied. + +Recipes that consume sidecar manifests should run the verifier first, then +load/search/rerank only if verification succeeds. + +You can also verify using whatever your deployment already trusts: - a checksum manifest (e.g. SHA-256) recorded by the build that produced the index, verified at load time; @@ -48,8 +72,8 @@ loading using whatever your deployment already trusts: - a signature / attestation layer (e.g. 
Sigstore, GitHub artifact attestations) over the index files. -`ordvec` deliberately ships **no** built-in signing/MAC layer today: without a -concrete deployment requiring it, an in-format crypto layer would add key -management with no clear owner. A sidecar verifier (e.g. an `ordvec verify` -utility, or an external HMAC/BLAKE3 manifest) can be added later **without a -file-format change** if a real deployment needs tamper-evidence. +`ordvec-manifest` is not a trust oracle. It does **not** sign, manage keys, +call networks, mutate index files, change the C ABI, or decide whether a +builder or signer is trusted. `ordvec` deliberately ships **no** built-in +signing/MAC layer today: without a concrete deployment requiring it, an +in-format crypto layer would add key management with no clear owner. diff --git a/ordvec-manifest/Cargo.toml b/ordvec-manifest/Cargo.toml new file mode 100644 index 0000000..e83916d --- /dev/null +++ b/ordvec-manifest/Cargo.toml @@ -0,0 +1,35 @@ +[package] +name = "ordvec-manifest" +version = "0.0.0" +edition = "2021" +rust-version = "1.89" +publish = false +license = "MIT OR Apache-2.0" +description = "Repo-local ordvec index manifest verifier" + +[lib] +name = "ordvec_manifest" +path = "src/lib.rs" + +[[bin]] +name = "ordvec-manifest" +path = "src/main.rs" + +[dependencies] +chrono = { version = "0.4.44", default-features = false, features = ["clock", "std"] } +clap = { version = "4.6.1", features = ["derive"] } +hex = "0.4.3" +ordvec = { path = ".." 
} +rusqlite = { version = "0.39.0", optional = true } +serde = { version = "1.0", features = ["derive"] } +serde_json = "1.0" +sha2 = "0.10.9" +uuid = { version = "1.23.0", features = ["v4"] } + +[dev-dependencies] +tempfile = "3.27.0" + +[features] +default = [] +sqlite = ["dep:rusqlite"] +sqlite-bundled = ["sqlite", "rusqlite/bundled"] diff --git a/ordvec-manifest/README.md b/ordvec-manifest/README.md new file mode 100644 index 0000000..b333cac --- /dev/null +++ b/ordvec-manifest/README.md @@ -0,0 +1,22 @@ +# ordvec-manifest + +Repo-local, publish=false sidecar verifier for ordvec index manifests. + +It verifies index bytes, probed header metadata, row identity, and attestation +shape before a caller loads an ordvec index. It does not sign artifacts, manage +keys, call networks, mutate index files, decide deployment trust policy, or +change the C ABI. + +```sh +cargo run -p ordvec-manifest -- create \ + --index path/to/index.tvrq \ + --row-id-is-identity \ + --embedding-model bge-small-en-v1.5 \ + --out path/to/index.manifest.json + +cargo run -p ordvec-manifest -- verify --manifest path/to/index.manifest.json +``` + +The schema version is `ordvec.index_manifest.v1`. Relative paths resolve from +the manifest file's directory, absolute paths are rejected by default, and +relative paths may not escape the manifest directory unless explicitly allowed. 
diff --git a/ordvec-manifest/src/lib.rs b/ordvec-manifest/src/lib.rs new file mode 100644 index 0000000..b6d9263 --- /dev/null +++ b/ordvec-manifest/src/lib.rs @@ -0,0 +1,1105 @@ +use chrono::{DateTime, SecondsFormat, Utc}; +use ordvec::{ + probe_index_metadata, IndexKind as CoreIndexKind, IndexMetadata as CoreIndexMetadata, + IndexParams as CoreIndexParams, +}; +use serde::{Deserialize, Serialize}; +use sha2::{Digest, Sha256}; +use std::collections::{BTreeMap, HashSet}; +use std::fmt; +use std::fs::{self, File}; +use std::io::{self, BufRead, BufReader, Read}; +use std::path::{Component, Path, PathBuf}; +use uuid::Uuid; + +pub const SCHEMA_VERSION: &str = "ordvec.index_manifest.v1"; + +#[derive(Debug)] +pub enum ManifestError { + Io(io::Error), + Json(serde_json::Error), + Invalid(String), +} + +impl ManifestError { + pub fn invalid(message: impl Into) -> Self { + Self::Invalid(message.into()) + } +} + +impl fmt::Display for ManifestError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::Io(err) => write!(f, "{err}"), + Self::Json(err) => write!(f, "{err}"), + Self::Invalid(message) => f.write_str(message), + } + } +} + +impl std::error::Error for ManifestError {} + +impl From for ManifestError { + fn from(value: io::Error) -> Self { + Self::Io(value) + } +} + +impl From for ManifestError { + fn from(value: serde_json::Error) -> Self { + Self::Json(value) + } +} + +#[derive(Clone, Debug)] +pub struct ManifestDocument { + pub manifest: IndexManifest, + pub source_path: Option, + pub base_dir: PathBuf, +} + +pub fn load_manifest_file(path: impl AsRef) -> Result { + let path = path.as_ref(); + let file = File::open(path)?; + let manifest: IndexManifest = serde_json::from_reader(file)?; + let base_dir = path + .parent() + .filter(|p| !p.as_os_str().is_empty()) + .unwrap_or_else(|| Path::new(".")) + .to_path_buf(); + Ok(ManifestDocument { + manifest, + source_path: Some(path.to_path_buf()), + base_dir, + }) +} + +pub fn 
verify_manifest_with_base( + manifest: IndexManifest, + base_dir: impl Into, + options: VerifyOptions, +) -> VerificationReport { + let document = ManifestDocument { + manifest, + source_path: None, + base_dir: base_dir.into(), + }; + verify_manifest(&document, options) +} + +pub fn verify_manifest(document: &ManifestDocument, options: VerifyOptions) -> VerificationReport { + let mut report = VerificationReport::new(Some(document.manifest.manifest_id.clone())); + validate_manifest_shape(&document.manifest, &mut report); + + let artifact_display_path = document.manifest.artifact.path.clone(); + report.artifact.manifest_path = Some(artifact_display_path.clone()); + let artifact_path = options + .index_override + .as_ref() + .cloned() + .unwrap_or_else(|| PathBuf::from(&document.manifest.artifact.path)); + report.artifact.observed_path = Some(path_to_display(&artifact_path)); + + if let Some(resolved) = resolve_existing_path( + &artifact_path, + &document.base_dir, + &options, + "artifact", + &mut report.errors, + ) { + report.artifact.canonical_path = Some(path_to_display(&resolved.canonical_path)); + match sha256_file(&resolved.resolved_path) { + Ok(hash) => { + report.artifact.sha256 = Some(hash.sha256.clone()); + report.artifact.size_bytes = Some(hash.size_bytes); + if !hex_digest_eq(&hash.sha256, &document.manifest.artifact.sha256) { + report.error( + "artifact_sha256_mismatch", + format!( + "artifact SHA-256 was {}, manifest declares {}", + hash.sha256, document.manifest.artifact.sha256 + ), + ); + } + if hash.size_bytes != document.manifest.artifact.file_size_bytes { + report.error( + "artifact_file_size_mismatch", + format!( + "artifact size was {}, manifest declares {}", + hash.size_bytes, document.manifest.artifact.file_size_bytes + ), + ); + } + } + Err(err) => report.error( + "artifact_hash_failed", + format!("failed to hash artifact: {err}"), + ), + } + + match probe_index_metadata(&resolved.resolved_path) { + Ok(metadata) => { + let metadata_report = 
MetadataReport::from_core(&metadata); + compare_artifact_metadata(&document.manifest.artifact, &metadata, &mut report); + report.artifact.metadata = Some(metadata_report); + } + Err(err) => report.error( + "artifact_probe_failed", + format!("failed to probe artifact metadata: {err}"), + ), + } + } + + verify_row_identity(document, &options, &mut report); + verify_attestations(&document.manifest, &mut report); + + report.ok = report.errors.is_empty(); + report +} + +fn validate_manifest_shape(manifest: &IndexManifest, report: &mut VerificationReport) { + if manifest.schema_version != SCHEMA_VERSION { + report.error( + "schema_version_unsupported", + format!( + "schema_version must be {SCHEMA_VERSION}, got {}", + manifest.schema_version + ), + ); + } + if manifest.manifest_id.trim().is_empty() { + report.error("manifest_id_empty", "manifest_id must be non-empty"); + } + if DateTime::parse_from_rfc3339(&manifest.created_at).is_err() { + report.error("created_at_invalid", "created_at must parse as RFC3339"); + } + if manifest.embedding.model.trim().is_empty() { + report.error("embedding_model_empty", "embedding.model must be non-empty"); + } + if manifest.embedding.dim == 0 { + report.error( + "embedding_dim_zero", + "embedding.dim must be greater than zero", + ); + } + if manifest.artifact.path.trim().is_empty() { + report.error("artifact_path_empty", "artifact.path must be non-empty"); + } + if !is_sha256_hex(&manifest.artifact.sha256) { + report.error( + "artifact_sha256_invalid", + "artifact.sha256 must be a 64-character hex SHA-256 digest", + ); + } + if manifest.artifact.bytes_per_vec == 0 { + report.error( + "artifact_bytes_per_vec_zero", + "artifact.bytes_per_vec must be greater than zero", + ); + } + if manifest.artifact.dim != manifest.embedding.dim { + report.error( + "artifact_embedding_dim_mismatch", + format!( + "artifact.dim {} does not match embedding.dim {}", + manifest.artifact.dim, manifest.embedding.dim + ), + ); + } + if 
!artifact_kind_matches_params(manifest.artifact.kind, &manifest.artifact.params) { + report.error( + "artifact_params_kind_mismatch", + "artifact.params discriminator does not match artifact.kind", + ); + } + + let row_count = manifest.row_identity.row_count(); + if manifest.artifact.vector_count != row_count { + report.error( + "artifact_row_count_mismatch", + format!( + "artifact.vector_count {} does not match row_identity.row_count {}", + manifest.artifact.vector_count, row_count + ), + ); + } + if let RowIdentity::Jsonl { + path, + sha256, + id_kind, + .. + } = &manifest.row_identity + { + if path.trim().is_empty() { + report.error( + "row_identity_path_empty", + "row_identity.path must be non-empty", + ); + } + if !is_sha256_hex(sha256) { + report.error( + "row_identity_sha256_invalid", + "row_identity.sha256 must be a 64-character hex SHA-256 digest", + ); + } + if id_kind != "uuid" { + report.error( + "row_identity_id_kind_unsupported", + "row_identity.id_kind must be uuid in v1", + ); + } + } + + if let Some(build) = &manifest.build { + if build.invocation_id.trim().is_empty() { + report.error( + "build_invocation_id_empty", + "build.invocation_id must be non-empty", + ); + } + if build + .builder_id + .as_ref() + .is_some_and(|builder_id| builder_id.trim().is_empty()) + { + report.error( + "build_builder_id_empty", + "build.builder_id must be non-empty", + ); + } + } + + for key in manifest.extensions.keys() { + if !extension_key_is_namespaced(key) { + report.error( + "extension_key_not_namespaced", + format!("extension key {key:?} must be namespaced"), + ); + } + } +} + +fn artifact_kind_matches_params(kind: ManifestIndexKind, params: &ManifestIndexParams) -> bool { + matches!( + (kind, params), + (ManifestIndexKind::Rank, ManifestIndexParams::Rank) + | ( + ManifestIndexKind::RankQuant, + ManifestIndexParams::RankQuant { .. } + ) + | ( + ManifestIndexKind::Bitmap, + ManifestIndexParams::Bitmap { .. 
} + ) + | ( + ManifestIndexKind::SignBitmap, + ManifestIndexParams::SignBitmap + ) + ) +} + +fn compare_artifact_metadata( + artifact: &Artifact, + metadata: &CoreIndexMetadata, + report: &mut VerificationReport, +) { + let observed_kind = ManifestIndexKind::from_core(metadata.kind); + if artifact.kind != observed_kind { + report.error( + "artifact_kind_mismatch", + format!( + "artifact kind was {:?}, manifest declares {:?}", + observed_kind, artifact.kind + ), + ); + } + let observed_params = ManifestIndexParams::from_core(metadata.params); + if artifact.params != observed_params { + report.error( + "artifact_params_mismatch", + format!( + "artifact params were {:?}, manifest declares {:?}", + observed_params, artifact.params + ), + ); + } + if artifact.format_version != metadata.format_version { + report.error( + "artifact_format_version_mismatch", + format!( + "artifact format_version was {}, manifest declares {}", + metadata.format_version, artifact.format_version + ), + ); + } + if artifact.dim != metadata.dim { + report.error( + "artifact_dim_mismatch", + format!( + "artifact dim was {}, manifest declares {}", + metadata.dim, artifact.dim + ), + ); + } + if artifact.vector_count != metadata.vector_count { + report.error( + "artifact_vector_count_mismatch", + format!( + "artifact vector_count was {}, manifest declares {}", + metadata.vector_count, artifact.vector_count + ), + ); + } + if artifact.bytes_per_vec != metadata.bytes_per_vec { + report.error( + "artifact_bytes_per_vec_mismatch", + format!( + "artifact bytes_per_vec was {}, manifest declares {}", + metadata.bytes_per_vec, artifact.bytes_per_vec + ), + ); + } + if artifact.file_size_bytes != metadata.file_size_bytes { + report.error( + "artifact_metadata_file_size_mismatch", + format!( + "artifact metadata file_size_bytes was {}, manifest declares {}", + metadata.file_size_bytes, artifact.file_size_bytes + ), + ); + } +} + +fn verify_row_identity( + document: &ManifestDocument, + options: 
&VerifyOptions, + report: &mut VerificationReport, +) { + match &document.manifest.row_identity { + RowIdentity::RowIdIdentity { row_count } => { + report.row_identity.kind = Some("row_id_identity".to_string()); + report.row_identity.row_count = Some(*row_count); + } + RowIdentity::Jsonl { + path, + sha256, + row_count, + .. + } => { + report.row_identity.kind = Some("jsonl".to_string()); + report.row_identity.manifest_path = Some(path.clone()); + report.row_identity.row_count = Some(*row_count); + let row_path = PathBuf::from(path); + if let Some(resolved) = resolve_existing_path( + &row_path, + &document.base_dir, + options, + "row_identity", + &mut report.errors, + ) { + report.row_identity.canonical_path = + Some(path_to_display(&resolved.canonical_path)); + match sha256_file(&resolved.resolved_path) { + Ok(hash) => { + report.row_identity.sha256 = Some(hash.sha256.clone()); + if !hex_digest_eq(&hash.sha256, sha256) { + report.error( + "row_identity_sha256_mismatch", + format!( + "row_identity SHA-256 was {}, manifest declares {}", + hash.sha256, sha256 + ), + ); + } + } + Err(err) => report.error( + "row_identity_hash_failed", + format!("failed to hash row identity file: {err}"), + ), + } + + match validate_jsonl_rows( + &resolved.resolved_path, + options.allow_duplicate_db_ids, + &mut report.errors, + ) { + Ok(stats) => { + report.row_identity.validated_rows = Some(stats.row_count); + if stats.row_count != *row_count { + report.error( + "row_identity_row_count_mismatch", + format!( + "row identity file has {} rows, manifest declares {}", + stats.row_count, row_count + ), + ); + } + } + Err(err) => report.error( + "row_identity_read_failed", + format!("failed to read row identity file: {err}"), + ), + } + } + } + } +} + +fn verify_attestations(manifest: &IndexManifest, report: &mut VerificationReport) { + if manifest.attestations.is_empty() { + report + .skipped_checks + .push("attestations_absent".to_string()); + return; + } + + let artifact_sha = report + 
.artifact + .sha256 + .clone() + .unwrap_or_else(|| manifest.artifact.sha256.clone()); + let mut any_subject_match = false; + for (idx, attestation) in manifest.attestations.iter().enumerate() { + let predicate_type = attestation + .get("predicateType") + .or_else(|| attestation.get("predicate_type")) + .and_then(serde_json::Value::as_str) + .map(ToOwned::to_owned); + if predicate_type.is_none() { + report.error( + "attestation_predicate_type_missing", + format!("attestation {idx} has no predicateType"), + ); + } + + let builder_id = attestation + .pointer("/predicate/builder/id") + .or_else(|| attestation.pointer("/predicate/runDetails/builder/id")) + .and_then(serde_json::Value::as_str) + .map(ToOwned::to_owned); + + let subject_sha256_matched = attestation + .get("subject") + .and_then(serde_json::Value::as_array) + .is_some_and(|subjects| { + subjects.iter().any(|subject| { + subject + .pointer("/digest/sha256") + .and_then(serde_json::Value::as_str) + .is_some_and(|digest| hex_digest_eq(digest, &artifact_sha)) + }) + }); + any_subject_match |= subject_sha256_matched; + report.attestation_shape_checks.push(AttestationShapeCheck { + predicate_type, + builder_id, + subject_sha256_matched, + }); + } + + if !any_subject_match { + report.error( + "attestation_subject_sha256_mismatch", + "no supplied attestation subject digest matches the artifact SHA-256", + ); + } +} + +#[derive(Clone, Debug, Default)] +pub struct VerifyOptions { + pub allow_absolute_paths: bool, + pub allow_path_escape: bool, + pub allow_duplicate_db_ids: bool, + pub index_override: Option, +} + +#[derive(Clone, Debug)] +struct ResolvedPath { + resolved_path: PathBuf, + canonical_path: PathBuf, +} + +fn resolve_existing_path( + path: &Path, + base_dir: &Path, + options: &VerifyOptions, + context: &str, + errors: &mut Vec, +) -> Option { + if path.is_absolute() && !options.allow_absolute_paths { + errors.push(ReportIssue::new( + format!("{context}_absolute_path_rejected"), + format!("absolute path 
{} is rejected by default", path.display()), + )); + return None; + } + + let base_canonical = match fs::canonicalize(base_dir) { + Ok(path) => path, + Err(err) => { + errors.push(ReportIssue::new( + format!("{context}_base_dir_unavailable"), + format!( + "failed to canonicalize base_dir {}: {err}", + base_dir.display() + ), + )); + return None; + } + }; + + if !path.is_absolute() && !options.allow_path_escape && has_lexical_escape(path) { + errors.push(ReportIssue::new( + format!("{context}_path_escape_rejected"), + format!("relative path {} escapes the manifest base", path.display()), + )); + return None; + } + + let resolved_path = if path.is_absolute() { + path.to_path_buf() + } else { + base_dir.join(path) + }; + let canonical_path = match fs::canonicalize(&resolved_path) { + Ok(path) => path, + Err(err) => { + errors.push(ReportIssue::new( + format!("{context}_path_unavailable"), + format!("failed to canonicalize {}: {err}", resolved_path.display()), + )); + return None; + } + }; + + if !options.allow_path_escape && !canonical_path.starts_with(&base_canonical) { + errors.push(ReportIssue::new( + format!("{context}_path_escape_rejected"), + format!( + "canonical path {} is outside manifest base {}", + canonical_path.display(), + base_canonical.display() + ), + )); + return None; + } + + Some(ResolvedPath { + resolved_path, + canonical_path, + }) +} + +fn has_lexical_escape(path: &Path) -> bool { + let mut depth = 0usize; + for component in path.components() { + match component { + Component::CurDir => {} + Component::Normal(_) => depth += 1, + Component::ParentDir => { + if depth == 0 { + return true; + } + depth -= 1; + } + Component::Prefix(_) | Component::RootDir => return true, + } + } + false +} + +#[derive(Clone, Debug, Serialize, Deserialize)] +#[serde(deny_unknown_fields)] +pub struct IndexManifest { + pub schema_version: String, + pub manifest_id: String, + pub created_at: String, + pub artifact: Artifact, + pub embedding: Embedding, + pub 
row_identity: RowIdentity, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub build: Option, + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub attestations: Vec, + #[serde(default, skip_serializing_if = "BTreeMap::is_empty")] + pub extensions: BTreeMap, +} + +#[derive(Clone, Debug, Serialize, Deserialize)] +#[serde(deny_unknown_fields)] +pub struct Artifact { + pub path: String, + pub sha256: String, + pub kind: ManifestIndexKind, + pub format_version: u8, + pub dim: usize, + pub vector_count: usize, + pub bytes_per_vec: usize, + pub params: ManifestIndexParams, + pub file_size_bytes: u64, +} + +#[derive(Clone, Debug, Serialize, Deserialize)] +#[serde(deny_unknown_fields)] +pub struct Embedding { + pub model: String, + pub dim: usize, +} + +#[derive(Clone, Debug, Serialize, Deserialize)] +#[serde(deny_unknown_fields)] +pub struct BuildInfo { + pub invocation_id: String, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub builder_id: Option, +} + +#[derive(Clone, Debug, Serialize, Deserialize)] +#[serde(tag = "kind", rename_all = "snake_case", deny_unknown_fields)] +pub enum RowIdentity { + RowIdIdentity { + row_count: usize, + }, + Jsonl { + path: String, + sha256: String, + row_count: usize, + id_kind: String, + #[serde(default, skip_serializing_if = "Option::is_none")] + db: Option, + }, +} + +impl RowIdentity { + pub fn row_count(&self) -> usize { + match self { + Self::RowIdIdentity { row_count } | Self::Jsonl { row_count, .. 
} => *row_count, + } + } +} + +#[derive(Clone, Debug, Serialize, Deserialize)] +#[serde(deny_unknown_fields)] +pub struct RowIdentityDb { + #[serde(default, skip_serializing_if = "Option::is_none")] + pub path: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub table: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub id_column: Option, +} + +#[derive(Copy, Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum ManifestIndexKind { + Rank, + RankQuant, + Bitmap, + SignBitmap, +} + +impl ManifestIndexKind { + fn from_core(kind: CoreIndexKind) -> Self { + match kind { + CoreIndexKind::Rank => Self::Rank, + CoreIndexKind::RankQuant => Self::RankQuant, + CoreIndexKind::Bitmap => Self::Bitmap, + CoreIndexKind::SignBitmap => Self::SignBitmap, + } + } +} + +#[derive(Copy, Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] +#[serde(tag = "kind", rename_all = "snake_case", deny_unknown_fields)] +pub enum ManifestIndexParams { + Rank, + RankQuant { bits: u8 }, + Bitmap { n_top: usize }, + SignBitmap, +} + +impl ManifestIndexParams { + fn from_core(params: CoreIndexParams) -> Self { + match params { + CoreIndexParams::Rank => Self::Rank, + CoreIndexParams::RankQuant { bits } => Self::RankQuant { bits }, + CoreIndexParams::Bitmap { n_top } => Self::Bitmap { n_top }, + CoreIndexParams::SignBitmap => Self::SignBitmap, + } + } +} + +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct VerificationReport { + pub ok: bool, + pub checked_at: String, + pub manifest_id: Option, + pub artifact: ArtifactReport, + pub row_identity: RowIdentityReport, + pub attestation_shape_checks: Vec, + pub errors: Vec, + pub warnings: Vec, + pub skipped_checks: Vec, +} + +impl VerificationReport { + fn new(manifest_id: Option) -> Self { + Self { + ok: false, + checked_at: Utc::now().to_rfc3339_opts(SecondsFormat::Secs, true), + manifest_id, + artifact: ArtifactReport::default(), + row_identity: 
RowIdentityReport::default(), + attestation_shape_checks: Vec::new(), + errors: Vec::new(), + warnings: Vec::new(), + skipped_checks: Vec::new(), + } + } + + fn error(&mut self, code: impl Into, message: impl Into) { + self.errors.push(ReportIssue::new(code, message)); + } +} + +#[derive(Clone, Debug, Default, Serialize, Deserialize)] +pub struct ArtifactReport { + pub manifest_path: Option, + pub observed_path: Option, + pub canonical_path: Option, + pub sha256: Option, + pub size_bytes: Option, + pub metadata: Option, +} + +#[derive(Clone, Debug, Default, Serialize, Deserialize)] +pub struct RowIdentityReport { + pub kind: Option, + pub manifest_path: Option, + pub canonical_path: Option, + pub sha256: Option, + pub row_count: Option, + pub validated_rows: Option, +} + +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct MetadataReport { + pub kind: ManifestIndexKind, + pub format_version: u8, + pub dim: usize, + pub vector_count: usize, + pub bytes_per_vec: usize, + pub params: ManifestIndexParams, + pub file_size_bytes: u64, +} + +impl MetadataReport { + fn from_core(metadata: &CoreIndexMetadata) -> Self { + Self { + kind: ManifestIndexKind::from_core(metadata.kind), + format_version: metadata.format_version, + dim: metadata.dim, + vector_count: metadata.vector_count, + bytes_per_vec: metadata.bytes_per_vec, + params: ManifestIndexParams::from_core(metadata.params), + file_size_bytes: metadata.file_size_bytes, + } + } +} + +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct AttestationShapeCheck { + pub predicate_type: Option, + pub builder_id: Option, + pub subject_sha256_matched: bool, +} + +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct ReportIssue { + pub code: String, + pub message: String, +} + +impl ReportIssue { + pub fn new(code: impl Into, message: impl Into) -> Self { + Self { + code: code.into(), + message: message.into(), + } + } +} + +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct FileHash { + pub sha256: 
String, + pub size_bytes: u64, +} + +pub fn sha256_file(path: impl AsRef) -> io::Result { + let mut file = File::open(path)?; + let mut hasher = Sha256::new(); + let mut size_bytes = 0u64; + let mut buf = [0u8; 64 * 1024]; + loop { + let n = file.read(&mut buf)?; + if n == 0 { + break; + } + size_bytes += n as u64; + hasher.update(&buf[..n]); + } + Ok(FileHash { + sha256: hex::encode(hasher.finalize()), + size_bytes, + }) +} + +#[derive(Clone, Debug)] +pub enum CreateRowIdentity { + RowIdIdentity, + Jsonl(PathBuf), +} + +pub fn create_manifest_for_index( + index_path: impl AsRef, + row_identity: CreateRowIdentity, + embedding_model: impl Into, + out_path: impl AsRef, +) -> Result { + let index_path = index_path.as_ref(); + let out_path = out_path.as_ref(); + let out_base = out_path + .parent() + .filter(|p| !p.as_os_str().is_empty()) + .unwrap_or_else(|| Path::new(".")); + let metadata = probe_index_metadata(index_path)?; + let index_hash = sha256_file(index_path)?; + let artifact = Artifact { + path: manifest_relative_path(index_path, out_base), + sha256: index_hash.sha256, + kind: ManifestIndexKind::from_core(metadata.kind), + format_version: metadata.format_version, + dim: metadata.dim, + vector_count: metadata.vector_count, + bytes_per_vec: metadata.bytes_per_vec, + params: ManifestIndexParams::from_core(metadata.params), + file_size_bytes: metadata.file_size_bytes, + }; + + let row_identity = match row_identity { + CreateRowIdentity::RowIdIdentity => RowIdentity::RowIdIdentity { + row_count: metadata.vector_count, + }, + CreateRowIdentity::Jsonl(path) => { + let row_hash = sha256_file(&path)?; + let mut row_errors = Vec::new(); + let stats = validate_jsonl_rows(&path, false, &mut row_errors)?; + if !row_errors.is_empty() { + let codes = row_errors + .iter() + .map(|issue| issue.code.as_str()) + .collect::>() + .join(", "); + return Err(ManifestError::invalid(format!( + "row map is invalid: {codes}" + ))); + } + if stats.row_count != metadata.vector_count { + 
return Err(ManifestError::invalid(format!( + "row map has {} rows but index has {} vectors", + stats.row_count, metadata.vector_count + ))); + } + RowIdentity::Jsonl { + path: manifest_relative_path(&path, out_base), + sha256: row_hash.sha256, + row_count: stats.row_count, + id_kind: "uuid".to_string(), + db: None, + } + } + }; + + let invocation_id = format!("urn:uuid:{}", Uuid::new_v4()); + Ok(IndexManifest { + schema_version: SCHEMA_VERSION.to_string(), + manifest_id: format!("urn:uuid:{}", Uuid::new_v4()), + created_at: Utc::now().to_rfc3339_opts(SecondsFormat::Secs, true), + artifact, + embedding: Embedding { + model: embedding_model.into(), + dim: metadata.dim, + }, + row_identity, + build: Some(BuildInfo { + invocation_id, + builder_id: Some("ordvec-manifest".to_string()), + }), + attestations: Vec::new(), + extensions: BTreeMap::new(), + }) +} + +pub fn write_manifest_file( + manifest: &IndexManifest, + path: impl AsRef, +) -> Result<(), ManifestError> { + let file = File::create(path)?; + serde_json::to_writer_pretty(file, manifest)?; + Ok(()) +} + +#[derive(Clone, Debug)] +struct JsonlStats { + row_count: usize, +} + +#[derive(Debug, Deserialize)] +#[serde(deny_unknown_fields)] +struct JsonlRow { + row_id: usize, + db_id: String, + #[serde(default)] + parent_id: Option, +} + +fn validate_jsonl_rows( + path: &Path, + allow_duplicate_db_ids: bool, + errors: &mut Vec, +) -> io::Result { + let file = File::open(path)?; + let reader = BufReader::new(file); + let mut seen = HashSet::new(); + let mut row_count = 0usize; + + for (line_idx, line) in reader.lines().enumerate() { + let line = line?; + row_count += 1; + let row: JsonlRow = match serde_json::from_str(&line) { + Ok(row) => row, + Err(err) => { + errors.push(ReportIssue::new( + "row_identity_jsonl_invalid_json", + format!("line {line_idx} is not a strict row object: {err}"), + )); + continue; + } + }; + if row.row_id != line_idx { + errors.push(ReportIssue::new( + "row_identity_row_id_mismatch", + 
/// Reports whether an extension key is namespaced.
///
/// Three forms are accepted:
/// - a URN: `urn:` followed by at least one character;
/// - a URI-style key: `<scheme>://<rest>` with a syntactically valid scheme and a non-empty
///   remainder;
/// - a dotted name with at least two segments (`vendor.key`), each satisfying
///   [`valid_extension_part`].
///
/// Degenerate inputs such as `"://"`, `"https://"`, or a bare `"urn:"` carry no namespace and
/// are rejected.
fn extension_key_is_namespaced(key: &str) -> bool {
    if let Some(rest) = key.strip_prefix("urn:") {
        return !rest.is_empty();
    }
    if let Some((scheme, rest)) = key.split_once("://") {
        return valid_uri_scheme(scheme) && !rest.is_empty();
    }
    // A dot guarantees at least two segments; every segment must be well-formed.
    key.contains('.') && key.split('.').all(valid_extension_part)
}

/// Reports whether `scheme` is a letter followed by letters, digits, `+`, `-`, or `.`.
fn valid_uri_scheme(scheme: &str) -> bool {
    let mut bytes = scheme.bytes();
    bytes.next().is_some_and(|b| b.is_ascii_alphabetic())
        && bytes.all(|b| b.is_ascii_alphanumeric() || matches!(b, b'+' | b'-' | b'.'))
}

/// Reports whether one dotted-name segment is non-empty, uses only ASCII alphanumerics, `-`,
/// or `_`, and contains at least one alphanumeric character.
fn valid_extension_part(part: &str) -> bool {
    !part.is_empty()
        && part
            .bytes()
            .all(|b| b.is_ascii_alphanumeric() || b == b'-' || b == b'_')
        && part.bytes().any(|b| b.is_ascii_alphanumeric())
}
#[arg(long)] + index: PathBuf, + #[arg(long)] + row_map: Option, + #[arg(long)] + row_id_is_identity: bool, + #[arg(long)] + embedding_model: String, + #[arg(long)] + out: PathBuf, + }, + #[cfg(feature = "sqlite")] + Sqlite { + #[command(subcommand)] + command: SqliteCommands, + }, +} + +#[cfg(feature = "sqlite")] +#[derive(Subcommand)] +enum SqliteCommands { + Verify { + #[arg(long)] + db: PathBuf, + #[arg(long)] + manifest: PathBuf, + #[arg(long)] + use_cache: bool, + #[arg(long)] + index: Option, + #[arg(long)] + allow_absolute_paths: bool, + #[arg(long)] + allow_path_escape: bool, + #[arg(long)] + allow_duplicate_db_ids: bool, + #[arg(long)] + json: bool, + }, + Activate { + #[arg(long)] + db: PathBuf, + #[arg(long)] + manifest: PathBuf, + #[arg(long)] + force: bool, + #[arg(long)] + index: Option, + #[arg(long)] + allow_absolute_paths: bool, + #[arg(long)] + allow_path_escape: bool, + #[arg(long)] + allow_duplicate_db_ids: bool, + #[arg(long)] + json: bool, + }, +} + +fn main() { + std::process::exit(match run() { + Ok(code) => code, + Err(err) => { + eprintln!("error: {err}"); + EXIT_USAGE_OR_CONFIG + } + }); +} + +fn run() -> Result { + let cli = Cli::parse(); + match cli.command { + Commands::Hash { + path, + json: as_json, + } => { + let hash = sha256_file(&path)?; + if as_json { + print_json(&json!({ + "path": path, + "sha256": hash.sha256, + "size_bytes": hash.size_bytes, + }))?; + } else { + println!("{} {}", hash.sha256, path.display()); + } + Ok(0) + } + Commands::Inspect { + manifest, + json: as_json, + } => { + let document = load_manifest_file(&manifest)?; + if as_json { + print_json(&document.manifest)?; + } else { + println!("manifest_id: {}", document.manifest.manifest_id); + println!("schema_version: {}", document.manifest.schema_version); + println!("artifact: {}", document.manifest.artifact.path); + println!("row_identity: {}", row_identity_label(&document)); + } + Ok(0) + } + Commands::Verify { + manifest, + index, + allow_absolute_paths, + 
allow_path_escape, + allow_duplicate_db_ids, + json: as_json, + } => { + let document = load_manifest_file(&manifest)?; + let report = verify_manifest( + &document, + VerifyOptions { + allow_absolute_paths, + allow_path_escape, + allow_duplicate_db_ids, + index_override: index, + }, + ); + emit_report(&report, as_json)?; + Ok(if report.ok { + 0 + } else { + EXIT_VERIFICATION_FAILED + }) + } + Commands::Create { + index, + row_map, + row_id_is_identity, + embedding_model, + out, + } => { + let row_identity = match (row_map, row_id_is_identity) { + (Some(_), true) => { + return Err(ManifestError::invalid( + "use either --row-map or --row-id-is-identity, not both", + )); + } + (Some(path), false) => CreateRowIdentity::Jsonl(path), + (None, true) => CreateRowIdentity::RowIdIdentity, + (None, false) => { + return Err(ManifestError::invalid( + "one of --row-map or --row-id-is-identity is required", + )); + } + }; + if let Some(parent) = out.parent().filter(|p| !p.as_os_str().is_empty()) { + fs::create_dir_all(parent)?; + } + let manifest = create_manifest_for_index(&index, row_identity, embedding_model, &out)?; + write_manifest_file(&manifest, &out)?; + println!("{}", out.display()); + Ok(0) + } + #[cfg(feature = "sqlite")] + Commands::Sqlite { command } => run_sqlite(command), + } +} + +#[cfg(feature = "sqlite")] +fn run_sqlite(command: SqliteCommands) -> Result { + match command { + SqliteCommands::Verify { + db, + manifest, + use_cache, + index, + allow_absolute_paths, + allow_path_escape, + allow_duplicate_db_ids, + json: as_json, + } => { + let document = load_manifest_file(&manifest)?; + let report = ordvec_manifest::sqlite::verify_with_registry( + &db, + &document, + &manifest, + VerifyOptions { + allow_absolute_paths, + allow_path_escape, + allow_duplicate_db_ids, + index_override: index, + }, + use_cache, + )?; + emit_report(&report, as_json)?; + Ok(if report.ok { + 0 + } else { + EXIT_VERIFICATION_FAILED + }) + } + SqliteCommands::Activate { + db, + manifest, + 
force, + index, + allow_absolute_paths, + allow_path_escape, + allow_duplicate_db_ids, + json: as_json, + } => { + let document = load_manifest_file(&manifest)?; + let report = ordvec_manifest::sqlite::activate( + &db, + &document, + &manifest, + VerifyOptions { + allow_absolute_paths, + allow_path_escape, + allow_duplicate_db_ids, + index_override: index, + }, + force, + )?; + emit_report(&report, as_json)?; + Ok(if report.ok || force { + 0 + } else { + EXIT_VERIFICATION_FAILED + }) + } + } +} + +fn emit_report( + report: &ordvec_manifest::VerificationReport, + as_json: bool, +) -> Result<(), ManifestError> { + if as_json { + print_json(report)?; + } else if report.ok { + println!( + "verified {}", + report + .manifest_id + .as_deref() + .unwrap_or("") + ); + } else { + for issue in &report.errors { + eprintln!("{}: {}", issue.code, issue.message); + } + } + Ok(()) +} + +fn print_json(value: &impl serde::Serialize) -> Result<(), ManifestError> { + let stdout = std::io::stdout(); + let mut lock = stdout.lock(); + serde_json::to_writer_pretty(&mut lock, value)?; + use std::io::Write; + lock.write_all(b"\n")?; + Ok(()) +} + +fn row_identity_label(document: &ManifestDocument) -> &'static str { + match document.manifest.row_identity { + ordvec_manifest::RowIdentity::RowIdIdentity { .. } => "row_id_identity", + ordvec_manifest::RowIdentity::Jsonl { .. 
} => "jsonl", + } +} diff --git a/ordvec-manifest/src/sqlite.rs b/ordvec-manifest/src/sqlite.rs new file mode 100644 index 0000000..9010ee1 --- /dev/null +++ b/ordvec-manifest/src/sqlite.rs @@ -0,0 +1,133 @@ +use crate::{verify_manifest, ManifestDocument, ManifestError, VerificationReport, VerifyOptions}; +use chrono::{SecondsFormat, Utc}; +use rusqlite::{params, Connection, OptionalExtension}; +use std::path::Path; + +pub fn verify_with_registry( + db_path: impl AsRef, + document: &ManifestDocument, + manifest_path: impl AsRef, + options: VerifyOptions, + use_cache: bool, +) -> Result { + let mut conn = Connection::open(db_path).map_err(sqlite_err)?; + init(&conn)?; + if use_cache { + return load_cached_report(&conn, &document.manifest.manifest_id)?.ok_or_else(|| { + ManifestError::invalid(format!( + "no cached verification report for manifest_id {}", + document.manifest.manifest_id + )) + }); + } + + let report = verify_manifest(document, options); + store_report(&mut conn, document, manifest_path.as_ref(), &report)?; + Ok(report) +} + +pub fn activate( + db_path: impl AsRef, + document: &ManifestDocument, + manifest_path: impl AsRef, + options: VerifyOptions, + force: bool, +) -> Result { + let mut conn = Connection::open(db_path).map_err(sqlite_err)?; + init(&conn)?; + let report = verify_manifest(document, options); + store_report(&mut conn, document, manifest_path.as_ref(), &report)?; + if !report.ok && !force { + return Ok(report); + } + + conn.execute( + "INSERT INTO active_manifest(id, manifest_id, manifest_path, activated_at, forced) + VALUES(1, ?1, ?2, ?3, ?4) + ON CONFLICT(id) DO UPDATE SET + manifest_id=excluded.manifest_id, + manifest_path=excluded.manifest_path, + activated_at=excluded.activated_at, + forced=excluded.forced", + params![ + document.manifest.manifest_id, + manifest_path.as_ref().display().to_string(), + Utc::now().to_rfc3339_opts(SecondsFormat::Secs, true), + i64::from(force), + ], + ) + .map_err(sqlite_err)?; + Ok(report) +} + +fn 
init(conn: &Connection) -> Result<(), ManifestError> { + conn.execute_batch( + "CREATE TABLE IF NOT EXISTS verification_reports( + manifest_id TEXT NOT NULL, + manifest_path TEXT NOT NULL, + checked_at TEXT NOT NULL, + ok INTEGER NOT NULL, + report_json TEXT NOT NULL, + PRIMARY KEY(manifest_id, checked_at) + ); + CREATE TABLE IF NOT EXISTS active_manifest( + id INTEGER PRIMARY KEY CHECK(id = 1), + manifest_id TEXT NOT NULL, + manifest_path TEXT NOT NULL, + activated_at TEXT NOT NULL, + forced INTEGER NOT NULL + );", + ) + .map_err(sqlite_err)?; + Ok(()) +} + +fn store_report( + conn: &mut Connection, + document: &ManifestDocument, + manifest_path: &Path, + report: &VerificationReport, +) -> Result<(), ManifestError> { + let tx = conn.transaction().map_err(sqlite_err)?; + let report_json = serde_json::to_string(report)?; + tx.execute( + "INSERT OR REPLACE INTO verification_reports( + manifest_id, manifest_path, checked_at, ok, report_json + ) VALUES(?1, ?2, ?3, ?4, ?5)", + params![ + document.manifest.manifest_id, + manifest_path.display().to_string(), + report.checked_at, + i64::from(report.ok), + report_json, + ], + ) + .map_err(sqlite_err)?; + tx.commit().map_err(sqlite_err)?; + Ok(()) +} + +fn load_cached_report( + conn: &Connection, + manifest_id: &str, +) -> Result, ManifestError> { + let report_json: Option = conn + .query_row( + "SELECT report_json + FROM verification_reports + WHERE manifest_id = ?1 + ORDER BY checked_at DESC + LIMIT 1", + params![manifest_id], + |row| row.get(0), + ) + .optional() + .map_err(sqlite_err)?; + report_json + .map(|json| serde_json::from_str(&json).map_err(ManifestError::from)) + .transpose() +} + +fn sqlite_err(err: rusqlite::Error) -> ManifestError { + ManifestError::invalid(format!("sqlite error: {err}")) +} diff --git a/ordvec-manifest/tests/manifest.rs b/ordvec-manifest/tests/manifest.rs new file mode 100644 index 0000000..b232aee --- /dev/null +++ b/ordvec-manifest/tests/manifest.rs @@ -0,0 +1,458 @@ +use 
ordvec::RankQuant; +use ordvec_manifest::{ + create_manifest_for_index, load_manifest_file, sha256_file, verify_manifest_with_base, + CreateRowIdentity, RowIdentity, VerifyOptions, +}; +use serde_json::json; +use std::fs; +use std::io::Write; +use std::path::{Path, PathBuf}; +use std::process::Command; + +fn write_index(dir: &Path) -> PathBuf { + let path = dir.join("index.tvrq"); + let mut index = RankQuant::new(16, 2); + let docs: Vec = (0..32).map(|i| i as f32 - 12.0).collect(); + index.add(&docs); + index.write(&path).unwrap(); + path +} + +fn write_row_map(path: &Path, rows: &[(&str, Option<&str>)]) { + let mut file = fs::File::create(path).unwrap(); + for (row_id, (db_id, parent_id)) in rows.iter().enumerate() { + let value = if let Some(parent_id) = parent_id { + json!({"row_id": row_id, "db_id": db_id, "parent_id": parent_id}) + } else { + json!({"row_id": row_id, "db_id": db_id}) + }; + writeln!(file, "{value}").unwrap(); + } +} + +fn identity_manifest(dir: &Path) -> (tempfile::TempDir, ordvec_manifest::IndexManifest, PathBuf) { + let temp = tempfile::tempdir_in(dir).unwrap(); + let index = write_index(temp.path()); + let manifest_path = temp.path().join("manifest.json"); + let manifest = create_manifest_for_index( + &index, + CreateRowIdentity::RowIdIdentity, + "test-embedding", + &manifest_path, + ) + .unwrap(); + (temp, manifest, manifest_path) +} + +#[test] +fn create_then_verify_identity_manifest() { + let temp = tempfile::tempdir().unwrap(); + let index = write_index(temp.path()); + let manifest_path = temp.path().join("manifest.json"); + let manifest = create_manifest_for_index( + &index, + CreateRowIdentity::RowIdIdentity, + "test-embedding", + &manifest_path, + ) + .unwrap(); + + let report = verify_manifest_with_base(manifest, temp.path(), VerifyOptions::default()); + assert!(report.ok, "{:?}", report.errors); + assert_eq!(report.skipped_checks, ["attestations_absent"]); + assert_eq!( + report.artifact.metadata.unwrap().kind, + 
ordvec_manifest::ManifestIndexKind::RankQuant + ); +} + +#[test] +fn schema_rejects_unknown_fields_and_bad_extension_keys() { + let root = tempfile::tempdir().unwrap(); + let (temp, mut manifest, _manifest_path) = identity_manifest(root.path()); + + let mut value = serde_json::to_value(&manifest).unwrap(); + value + .as_object_mut() + .unwrap() + .insert("unknown".to_string(), json!(true)); + let parsed = serde_json::from_value::(value); + assert!( + parsed.is_err(), + "schema-owned structs must reject unknown fields" + ); + + manifest + .extensions + .insert("policy".to_string(), json!({"decision": "deny"})); + let report = verify_manifest_with_base(manifest.clone(), temp.path(), VerifyOptions::default()); + assert!(report + .errors + .iter() + .any(|issue| issue.code == "extension_key_not_namespaced")); + + manifest.extensions.clear(); + manifest.extensions.insert( + "com.example.policy".to_string(), + json!({"decision": "allow"}), + ); + let report = verify_manifest_with_base(manifest, temp.path(), VerifyOptions::default()); + assert!(report.ok, "{:?}", report.errors); +} + +#[test] +fn artifact_metadata_mismatches_are_reported_with_stable_codes() { + let root = tempfile::tempdir().unwrap(); + let (temp, mut manifest, _manifest_path) = identity_manifest(root.path()); + manifest.artifact.dim += 1; + manifest.embedding.dim += 1; + + let report = verify_manifest_with_base(manifest, temp.path(), VerifyOptions::default()); + assert!(!report.ok); + assert!(report + .errors + .iter() + .any(|issue| issue.code == "artifact_dim_mismatch")); +} + +#[test] +fn path_policy_rejects_escapes_and_absolute_paths_by_default() { + let root = tempfile::tempdir().unwrap(); + let base = root.path().join("manifests"); + fs::create_dir(&base).unwrap(); + let index = write_index(root.path()); + let manifest_path = base.join("manifest.json"); + let mut manifest = create_manifest_for_index( + &index, + CreateRowIdentity::RowIdIdentity, + "test-embedding", + &manifest_path, + ) + 
.unwrap(); + + manifest.artifact.path = "../index.tvrq".to_string(); + let report = verify_manifest_with_base(manifest.clone(), &base, VerifyOptions::default()); + assert!(report + .errors + .iter() + .any(|issue| issue.code == "artifact_path_escape_rejected")); + + let report = verify_manifest_with_base( + manifest.clone(), + &base, + VerifyOptions { + allow_path_escape: true, + ..VerifyOptions::default() + }, + ); + assert!(report.ok, "{:?}", report.errors); + + manifest.artifact.path = index.display().to_string(); + let report = verify_manifest_with_base(manifest.clone(), &base, VerifyOptions::default()); + assert!(report + .errors + .iter() + .any(|issue| issue.code == "artifact_absolute_path_rejected")); + + let report = verify_manifest_with_base( + manifest, + &base, + VerifyOptions { + allow_absolute_paths: true, + allow_path_escape: true, + ..VerifyOptions::default() + }, + ); + assert!(report.ok, "{:?}", report.errors); +} + +#[cfg(unix)] +#[test] +fn symlink_escape_reports_observed_canonical_path() { + use std::os::unix::fs::symlink; + + let root = tempfile::tempdir().unwrap(); + let base = root.path().join("base"); + let outside = root.path().join("outside"); + fs::create_dir(&base).unwrap(); + fs::create_dir(&outside).unwrap(); + let index = write_index(&outside); + symlink(&index, base.join("link.tvrq")).unwrap(); + let manifest_path = base.join("manifest.json"); + let mut manifest = create_manifest_for_index( + &index, + CreateRowIdentity::RowIdIdentity, + "test-embedding", + &manifest_path, + ) + .unwrap(); + manifest.artifact.path = "link.tvrq".to_string(); + + let report = verify_manifest_with_base(manifest.clone(), &base, VerifyOptions::default()); + assert!(report + .errors + .iter() + .any(|issue| issue.code == "artifact_path_escape_rejected")); + + let report = verify_manifest_with_base( + manifest, + &base, + VerifyOptions { + allow_path_escape: true, + ..VerifyOptions::default() + }, + ); + assert!(report.ok, "{:?}", report.errors); + 
assert_eq!( + PathBuf::from(report.artifact.canonical_path.unwrap()), + fs::canonicalize(index).unwrap() + ); +} + +#[test] +fn jsonl_row_identity_is_strict_and_duplicate_ids_need_opt_in() { + let temp = tempfile::tempdir().unwrap(); + let index = write_index(temp.path()); + let rows = temp.path().join("rows.jsonl"); + write_row_map( + &rows, + &[ + ("00000000-0000-0000-0000-000000000001", None), + ("00000000-0000-0000-0000-000000000001", None), + ], + ); + let row_hash = sha256_file(&rows).unwrap(); + let manifest_path = temp.path().join("manifest.json"); + let mut manifest = create_manifest_for_index( + &index, + CreateRowIdentity::RowIdIdentity, + "test-embedding", + &manifest_path, + ) + .unwrap(); + manifest.row_identity = RowIdentity::Jsonl { + path: "rows.jsonl".to_string(), + sha256: row_hash.sha256, + row_count: 2, + id_kind: "uuid".to_string(), + db: None, + }; + + let report = verify_manifest_with_base(manifest.clone(), temp.path(), VerifyOptions::default()); + assert!(report + .errors + .iter() + .any(|issue| issue.code == "row_identity_duplicate_db_id")); + + let report = verify_manifest_with_base( + manifest, + temp.path(), + VerifyOptions { + allow_duplicate_db_ids: true, + ..VerifyOptions::default() + }, + ); + assert!(report.ok, "{:?}", report.errors); + + fs::write( + &rows, + "{\"row_id\":1,\"db_id\":\"\"}\n{\"row_id\":1,\"db_id\":\"ok\",\"extra\":true}\n", + ) + .unwrap(); + let row_hash = sha256_file(&rows).unwrap(); + let mut manifest = create_manifest_for_index( + &index, + CreateRowIdentity::RowIdIdentity, + "test-embedding", + &manifest_path, + ) + .unwrap(); + manifest.row_identity = RowIdentity::Jsonl { + path: "rows.jsonl".to_string(), + sha256: row_hash.sha256, + row_count: 2, + id_kind: "uuid".to_string(), + db: None, + }; + let report = verify_manifest_with_base(manifest, temp.path(), VerifyOptions::default()); + assert!(report + .errors + .iter() + .any(|issue| issue.code == "row_identity_jsonl_invalid_json")); + assert!(report + 
.errors + .iter() + .any(|issue| issue.code == "row_identity_row_id_mismatch")); +} + +#[test] +fn attestation_shape_requires_matching_subject_sha256() { + let root = tempfile::tempdir().unwrap(); + let (temp, mut manifest, _manifest_path) = identity_manifest(root.path()); + manifest.attestations.push(json!({ + "predicateType": "https://slsa.dev/provenance/v1", + "predicate": {"builder": {"id": "builder"}}, + "subject": [{"name": "index.tvrq", "digest": {"sha256": "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"}}] + })); + + let report = verify_manifest_with_base(manifest.clone(), temp.path(), VerifyOptions::default()); + assert!(report + .errors + .iter() + .any(|issue| issue.code == "attestation_subject_sha256_mismatch")); + + let sha = manifest.artifact.sha256.clone(); + manifest.attestations[0]["subject"][0]["digest"]["sha256"] = json!(sha); + let report = verify_manifest_with_base(manifest, temp.path(), VerifyOptions::default()); + assert!(report.ok, "{:?}", report.errors); + assert_eq!( + report.attestation_shape_checks[0].predicate_type.as_deref(), + Some("https://slsa.dev/provenance/v1") + ); +} + +#[test] +fn cli_create_verify_and_exit_codes() { + let temp = tempfile::tempdir().unwrap(); + let index = write_index(temp.path()); + let manifest = temp.path().join("manifest.json"); + let bin = env!("CARGO_BIN_EXE_ordvec-manifest"); + + let output = Command::new(bin) + .args([ + "create", + "--index", + index.to_str().unwrap(), + "--row-id-is-identity", + "--embedding-model", + "test-embedding", + "--out", + manifest.to_str().unwrap(), + ]) + .output() + .unwrap(); + assert!( + output.status.success(), + "{}", + String::from_utf8_lossy(&output.stderr) + ); + + let output = Command::new(bin) + .args(["verify", "--manifest", manifest.to_str().unwrap(), "--json"]) + .output() + .unwrap(); + assert!( + output.status.success(), + "{}", + String::from_utf8_lossy(&output.stderr) + ); + + let mut document = load_manifest_file(&manifest).unwrap(); + 
document.manifest.artifact.sha256 = + "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb".to_string(); + fs::write( + &manifest, + serde_json::to_string_pretty(&document.manifest).unwrap(), + ) + .unwrap(); + let output = Command::new(bin) + .args(["verify", "--manifest", manifest.to_str().unwrap()]) + .output() + .unwrap(); + assert_eq!(output.status.code(), Some(1)); + + let output = Command::new(bin) + .args([ + "create", + "--index", + index.to_str().unwrap(), + "--embedding-model", + "test-embedding", + "--out", + manifest.to_str().unwrap(), + ]) + .output() + .unwrap(); + assert_eq!(output.status.code(), Some(2)); +} + +#[cfg(feature = "sqlite")] +#[test] +fn sqlite_cache_is_explicit_and_activation_reverifies_by_default() { + use std::fs::OpenOptions; + + let temp = tempfile::tempdir().unwrap(); + let index = write_index(temp.path()); + let manifest_path = temp.path().join("manifest.json"); + let manifest = create_manifest_for_index( + &index, + CreateRowIdentity::RowIdIdentity, + "test-embedding", + &manifest_path, + ) + .unwrap(); + fs::write( + &manifest_path, + serde_json::to_string_pretty(&manifest).unwrap(), + ) + .unwrap(); + let document = load_manifest_file(&manifest_path).unwrap(); + let db = temp.path().join("registry.sqlite"); + + let report = ordvec_manifest::sqlite::verify_with_registry( + &db, + &document, + &manifest_path, + VerifyOptions::default(), + false, + ) + .unwrap(); + assert!(report.ok, "{:?}", report.errors); + + OpenOptions::new() + .append(true) + .open(&index) + .unwrap() + .write_all(b"\0") + .unwrap(); + + let cached = ordvec_manifest::sqlite::verify_with_registry( + &db, + &document, + &manifest_path, + VerifyOptions::default(), + true, + ) + .unwrap(); + assert!(cached.ok, "{:?}", cached.errors); + + let fresh = ordvec_manifest::sqlite::verify_with_registry( + &db, + &document, + &manifest_path, + VerifyOptions::default(), + false, + ) + .unwrap(); + assert!(!fresh.ok); + + let activation = 
ordvec_manifest::sqlite::activate( + &db, + &document, + &manifest_path, + VerifyOptions::default(), + false, + ) + .unwrap(); + assert!(!activation.ok); + + let forced = ordvec_manifest::sqlite::activate( + &db, + &document, + &manifest_path, + VerifyOptions::default(), + true, + ) + .unwrap(); + assert!(!forced.ok); +} diff --git a/src/lib.rs b/src/lib.rs index 24dd8b4..06ea646 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -63,6 +63,7 @@ mod util; pub use bitmap::Bitmap; pub use quant::{rankquant_eval_search, RankQuant}; pub use rank::Rank; +pub use rank_io::{probe_index_metadata, IndexKind, IndexMetadata, IndexParams}; pub use sign_bitmap::SignBitmap; // `search_asymmetric_byte_lut` is a bench-only scoring reference: it diff --git a/src/rank_io.rs b/src/rank_io.rs index 32be790..ae2adf3 100644 --- a/src/rank_io.rs +++ b/src/rank_io.rs @@ -65,6 +65,41 @@ const TVBM_MAGIC: &[u8; 4] = b"TVBM"; const TVSB_MAGIC: &[u8; 4] = b"TVSB"; const VERSION: u8 = 1; +/// Persisted index family identified from an on-disk ordvec index header. +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +pub enum IndexKind { + Rank, + RankQuant, + Bitmap, + SignBitmap, +} + +/// Format-specific parameters declared by an on-disk ordvec index header. +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +pub enum IndexParams { + Rank, + RankQuant { bits: u8 }, + Bitmap { n_top: usize }, + SignBitmap, +} + +/// Header-derived metadata for a persisted ordvec index. +/// +/// [`probe_index_metadata`] validates the fixed header, declared dimensions, +/// version, payload byte count, and exact file length, but deliberately does +/// not allocate or inspect the payload rows. Full row-invariant validation +/// remains the job of the index loaders. 
+#[derive(Clone, Debug, PartialEq, Eq)] +pub struct IndexMetadata { + pub kind: IndexKind, + pub format_version: u8, + pub dim: usize, + pub vector_count: usize, + pub bytes_per_vec: usize, + pub params: IndexParams, + pub file_size_bytes: u64, +} + /// Largest accepted `dim` from a loaded file. Matches `u16::MAX` so the /// rank transform's `u16` invariant in [`crate::Rank`] is honoured. pub const MAX_DIM: usize = u16::MAX as usize; @@ -218,6 +253,187 @@ fn check_payload_bytes(payload_bytes: usize) -> io::Result<()> { Ok(()) } +fn read_u32_le(reader: &mut R) -> io::Result { + let mut buf = [0u8; 4]; + reader.read_exact(&mut buf)?; + Ok(u32::from_le_bytes(buf)) +} + +fn read_version(reader: &mut R, label: &str) -> io::Result { + let mut ver = [0u8; 1]; + reader.read_exact(&mut ver)?; + if ver[0] != VERSION { + return Err(invalid(format!("unsupported {label} version: {}", ver[0]))); + } + Ok(ver[0]) +} + +/// Probe an ordvec index file's fixed header and declared byte shape. +/// +/// This is the allocation-resistant metadata path used by external manifest +/// verification. It reads only the magic/version/parameter header plus file +/// metadata. It validates the same header domains as the full loaders and +/// requires the declared payload length to exactly match the remaining file +/// length, but it does not read or validate row payload invariants such as Rank +/// permutations, RankQuant constant composition, or Bitmap popcounts. 
+pub fn probe_index_metadata(path: impl AsRef) -> io::Result { + let file = File::open(path)?; + let file_size_bytes = file.metadata()?.len(); + let mut f = BufReader::new(file); + let mut magic = [0u8; 4]; + f.read_exact(&mut magic)?; + match &magic { + TVR_MAGIC => probe_rank_metadata(&mut f, file_size_bytes), + TVRQ_MAGIC => probe_rankquant_metadata(&mut f, file_size_bytes), + TVBM_MAGIC => probe_bitmap_metadata(&mut f, file_size_bytes), + TVSB_MAGIC => probe_sign_bitmap_metadata(&mut f, file_size_bytes), + _ => Err(invalid("unknown ordvec index magic")), + } +} + +fn probe_rank_metadata( + reader: &mut R, + file_size_bytes: u64, +) -> io::Result { + let format_version = read_version(reader, "TVR1")?; + let dim = read_u32_le(reader)? as usize; + check_dim(dim)?; + let vector_count = read_u32_le(reader)? as usize; + check_n_vectors(vector_count)?; + let bytes_per_vec = dim + .checked_mul(2) + .ok_or_else(|| invalid("bytes_per_vec overflows usize"))?; + let payload_bytes = vector_count + .checked_mul(bytes_per_vec) + .ok_or_else(|| invalid("payload size overflows usize"))?; + check_payload_bytes(payload_bytes)?; + check_payload_matches_file(reader, file_size_bytes, payload_bytes)?; + Ok(IndexMetadata { + kind: IndexKind::Rank, + format_version, + dim, + vector_count, + bytes_per_vec, + params: IndexParams::Rank, + file_size_bytes, + }) +} + +fn probe_rankquant_metadata( + reader: &mut R, + file_size_bytes: u64, +) -> io::Result { + let format_version = read_version(reader, "TVRQ")?; + let mut bits_buf = [0u8; 1]; + reader.read_exact(&mut bits_buf)?; + let bits = bits_buf[0]; + if !matches!(bits, 1 | 2 | 4) { + return Err(invalid(format!( + "unsupported TVRQ bits: {bits} (expected 1, 2, or 4)" + ))); + } + let dim = read_u32_le(reader)? 
as usize; + check_dim(dim)?; + let n_buckets = 1usize << bits; + if !dim.is_multiple_of(n_buckets) { + return Err(invalid(format!( + "TVRQ dim {dim} is not a multiple of 2^bits = {n_buckets}; \ + constant-composition invariant violated" + ))); + } + let codes_per_byte = (8 / bits) as usize; + if !dim.is_multiple_of(codes_per_byte) { + return Err(invalid(format!( + "TVRQ dim {dim} is not a multiple of codes_per_byte = {codes_per_byte}" + ))); + } + let vector_count = read_u32_le(reader)? as usize; + check_n_vectors(vector_count)?; + let payload_bytes = vector_count + .checked_mul(dim) + .and_then(|x| x.checked_mul(bits as usize)) + .map(|x| x / 8) + .ok_or_else(|| invalid("payload size overflows usize"))?; + check_payload_bytes(payload_bytes)?; + check_payload_matches_file(reader, file_size_bytes, payload_bytes)?; + let bytes_per_vec = dim + .checked_mul(bits as usize) + .map(|x| x / 8) + .ok_or_else(|| invalid("bytes_per_vec overflows usize"))?; + Ok(IndexMetadata { + kind: IndexKind::RankQuant, + format_version, + dim, + vector_count, + bytes_per_vec, + params: IndexParams::RankQuant { bits }, + file_size_bytes, + }) +} + +fn probe_bitmap_metadata( + reader: &mut R, + file_size_bytes: u64, +) -> io::Result { + let format_version = read_version(reader, "TVBM")?; + let dim = read_u32_le(reader)? as usize; + check_dim(dim)?; + if !dim.is_multiple_of(64) { + return Err(invalid(format!("TVBM dim {dim} is not a multiple of 64"))); + } + let n_top = read_u32_le(reader)? as usize; + if n_top == 0 || n_top >= dim { + return Err(invalid(format!( + "TVBM n_top {n_top} must satisfy 0 < n_top < dim ({dim})" + ))); + } + let vector_count = read_u32_le(reader)? 
as usize; + check_n_vectors(vector_count)?; + let qpv = dim / 64; + let payload_bytes = vector_count + .checked_mul(qpv) + .and_then(|x| x.checked_mul(8)) + .ok_or_else(|| invalid("payload size overflows usize"))?; + check_payload_bytes(payload_bytes)?; + check_payload_matches_file(reader, file_size_bytes, payload_bytes)?; + Ok(IndexMetadata { + kind: IndexKind::Bitmap, + format_version, + dim, + vector_count, + bytes_per_vec: dim / 8, + params: IndexParams::Bitmap { n_top }, + file_size_bytes, + }) +} + +fn probe_sign_bitmap_metadata( + reader: &mut R, + file_size_bytes: u64, +) -> io::Result { + let format_version = read_version(reader, "TVSB")?; + let dim = read_u32_le(reader)? as usize; + check_sign_bitmap_dim(dim)?; + let vector_count = read_u32_le(reader)? as usize; + check_n_vectors(vector_count)?; + let qpv = dim / 64; + let payload_bytes = vector_count + .checked_mul(qpv) + .and_then(|x| x.checked_mul(8)) + .ok_or_else(|| invalid("payload size overflows usize"))?; + check_payload_bytes(payload_bytes)?; + check_payload_matches_file(reader, file_size_bytes, payload_bytes)?; + Ok(IndexMetadata { + kind: IndexKind::SignBitmap, + format_version, + dim, + vector_count, + bytes_per_vec: dim / 8, + params: IndexParams::SignBitmap, + file_size_bytes, + }) +} + // ------------------------------------------------------------------- // Rank: u16 ranks per coordinate. 
// Header: magic(4) | version(1) | dim(u32 LE) | n_vectors(u32 LE) = 13 B @@ -661,9 +877,10 @@ pub(crate) fn load_sign_bitmap(path: impl AsRef) -> io::Result<(usize, usi #[cfg(test)] mod tests { use super::{ - load_rank, load_rankquant, write_bitmap, write_rank, write_rankquant, write_sign_bitmap, + load_bitmap, load_rank, load_rankquant, probe_index_metadata, write_bitmap, write_rank, + write_rankquant, write_sign_bitmap, IndexKind, IndexParams, MAX_DIM, MAX_VECTORS, VERSION, }; - use crate::Rank; + use crate::{Bitmap, Rank, RankQuant, SignBitmap}; use std::io::Write; use std::path::PathBuf; @@ -684,6 +901,182 @@ mod tests { p } + fn temp_index_path(suffix: &str) -> PathBuf { + let mut p = std::env::temp_dir(); + let nonce = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap() + .as_nanos(); + p.push(format!( + "rank_io_probe_{}_{}_{}", + std::process::id(), + nonce, + suffix + )); + p + } + + #[test] + fn probe_metadata_matches_full_loaders_on_generated_fixtures() { + let mut paths = Vec::new(); + + let rank_path = temp_index_path("rank.tvr"); + let mut rank = Rank::new(8); + rank.add(&[ + 1.0, 3.0, 2.0, 4.0, 8.0, 7.0, 6.0, 5.0, 8.0, 6.0, 7.0, 5.0, 1.0, 2.0, 3.0, 4.0, + ]); + rank.write(&rank_path).unwrap(); + let meta = probe_index_metadata(&rank_path).unwrap(); + let loaded = Rank::load(&rank_path).unwrap(); + assert_eq!(meta.kind, IndexKind::Rank); + assert_eq!(meta.params, IndexParams::Rank); + assert_eq!(meta.format_version, VERSION); + assert_eq!(meta.dim, loaded.dim()); + assert_eq!(meta.vector_count, loaded.len()); + assert_eq!(meta.bytes_per_vec, loaded.bytes_per_vec()); + assert_eq!( + meta.file_size_bytes, + std::fs::metadata(&rank_path).unwrap().len() + ); + paths.push(rank_path); + + let quant_path = temp_index_path("rankquant.tvrq"); + let mut quant = RankQuant::new(16, 2); + let quant_docs: Vec = (0..32).map(|i| i as f32 - 11.0).collect(); + quant.add(&quant_docs); + quant.write(&quant_path).unwrap(); + let meta = 
probe_index_metadata(&quant_path).unwrap(); + let loaded = RankQuant::load(&quant_path).unwrap(); + assert_eq!(meta.kind, IndexKind::RankQuant); + assert_eq!( + meta.params, + IndexParams::RankQuant { + bits: loaded.bits() + } + ); + assert_eq!(meta.format_version, VERSION); + assert_eq!(meta.dim, loaded.dim()); + assert_eq!(meta.vector_count, loaded.len()); + assert_eq!(meta.bytes_per_vec, loaded.bytes_per_vec()); + assert_eq!( + meta.file_size_bytes, + std::fs::metadata(&quant_path).unwrap().len() + ); + paths.push(quant_path); + + let bitmap_path = temp_index_path("bitmap.tvbm"); + let mut bitmap = Bitmap::new(64, 16); + let bitmap_docs: Vec = (0..128).map(|i| ((i * 17) % 31) as f32).collect(); + bitmap.add(&bitmap_docs); + bitmap.write(&bitmap_path).unwrap(); + let meta = probe_index_metadata(&bitmap_path).unwrap(); + let loaded = Bitmap::load(&bitmap_path).unwrap(); + assert_eq!(meta.kind, IndexKind::Bitmap); + assert_eq!( + meta.params, + IndexParams::Bitmap { + n_top: loaded.n_top() + } + ); + assert_eq!(meta.format_version, VERSION); + assert_eq!(meta.dim, loaded.dim()); + assert_eq!(meta.vector_count, loaded.len()); + assert_eq!(meta.bytes_per_vec, loaded.bytes_per_vec()); + assert_eq!( + meta.file_size_bytes, + std::fs::metadata(&bitmap_path).unwrap().len() + ); + paths.push(bitmap_path); + + let sign_path = temp_index_path("sign_bitmap.tvsb"); + let mut sign = SignBitmap::new(64); + let sign_docs: Vec = (0usize..128) + .map(|i| if i.is_multiple_of(3) { 1.0 } else { -1.0 }) + .collect(); + sign.add(&sign_docs); + sign.write(&sign_path).unwrap(); + let meta = probe_index_metadata(&sign_path).unwrap(); + let loaded = SignBitmap::load(&sign_path).unwrap(); + assert_eq!(meta.kind, IndexKind::SignBitmap); + assert_eq!(meta.params, IndexParams::SignBitmap); + assert_eq!(meta.format_version, VERSION); + assert_eq!(meta.dim, loaded.dim()); + assert_eq!(meta.vector_count, loaded.len()); + assert_eq!(meta.bytes_per_vec, loaded.bytes_per_vec()); + assert_eq!( + 
meta.file_size_bytes, + std::fs::metadata(&sign_path).unwrap().len() + ); + paths.push(sign_path); + + for path in paths { + std::fs::remove_file(path).ok(); + } + } + + #[test] + fn probe_rejects_header_and_length_errors_without_payload_allocation() { + let wrong_magic = forge("wrong_magic", b"NOPE"); + let err = probe_index_metadata(&wrong_magic).unwrap_err(); + assert_eq!(err.kind(), std::io::ErrorKind::InvalidData); + std::fs::remove_file(&wrong_magic).ok(); + + let bad_version = forge("bad_version", b"TVR1\x09"); + let err = probe_index_metadata(&bad_version).unwrap_err(); + assert_eq!(err.kind(), std::io::ErrorKind::InvalidData); + std::fs::remove_file(&bad_version).ok(); + + let truncated = forge("truncated_header", b"TVR1\x01"); + let err = probe_index_metadata(&truncated).unwrap_err(); + assert_eq!(err.kind(), std::io::ErrorKind::UnexpectedEof); + std::fs::remove_file(&truncated).ok(); + + let mut length_mismatch = Vec::new(); + length_mismatch.extend_from_slice(b"TVR1"); + length_mismatch.push(VERSION); + length_mismatch.extend_from_slice(&8u32.to_le_bytes()); + length_mismatch.extend_from_slice(&1u32.to_le_bytes()); + let length_mismatch = forge("length_mismatch", &length_mismatch); + let err = probe_index_metadata(&length_mismatch).unwrap_err(); + assert_eq!(err.kind(), std::io::ErrorKind::InvalidData); + std::fs::remove_file(&length_mismatch).ok(); + + let mut huge_declared = Vec::new(); + huge_declared.extend_from_slice(b"TVR1"); + huge_declared.push(VERSION); + huge_declared.extend_from_slice(&(MAX_DIM as u32).to_le_bytes()); + huge_declared.extend_from_slice(&(MAX_VECTORS as u32).to_le_bytes()); + let huge_declared = forge("huge_declared", &huge_declared); + let err = probe_index_metadata(&huge_declared).unwrap_err(); + assert_eq!(err.kind(), std::io::ErrorKind::InvalidData); + assert!( + err.to_string().contains("MAX_PAYLOAD"), + "unexpected error: {err}" + ); + std::fs::remove_file(&huge_declared).ok(); + } + + #[test] + fn 
probe_does_not_validate_payload_row_invariants() { + let mut forged = Vec::new(); + forged.extend_from_slice(b"TVBM"); + forged.push(VERSION); + forged.extend_from_slice(&64u32.to_le_bytes()); + forged.extend_from_slice(&16u32.to_le_bytes()); + forged.extend_from_slice(&1u32.to_le_bytes()); + forged.extend_from_slice(&0u64.to_le_bytes()); + let path = forge("bad_bitmap_payload.tvbm", &forged); + + let meta = probe_index_metadata(&path).expect("probe reads only metadata"); + assert_eq!(meta.kind, IndexKind::Bitmap); + assert_eq!(meta.dim, 64); + assert_eq!(meta.vector_count, 1); + + let load_err = load_bitmap(&path).unwrap_err(); + assert_eq!(load_err.kind(), std::io::ErrorKind::InvalidData); + std::fs::remove_file(&path).ok(); + } + // ------------------------------------------------------------------- // Loader semantic-validation red-team (TV-DESER-004 / 005). Moved here // from tests/redteam_delta.rs when the rank_io read/write helpers became From 49f8528097ed1d3ff1f81616b78d2d2bd3b6f12d Mon Sep 17 00:00:00 2001 From: Nelson Spence Date: Fri, 29 May 2026 01:44:20 -0500 Subject: [PATCH 2/6] address manifest review findings Signed-off-by: Nelson Spence --- ordvec-manifest/src/lib.rs | 10 +- ordvec-manifest/src/main.rs | 2 +- ordvec-manifest/src/sqlite.rs | 279 ++++++++++++++++++++++++++++-- ordvec-manifest/tests/manifest.rs | 62 ++++++- src/rank_io.rs | 67 +++++++ 5 files changed, 396 insertions(+), 24 deletions(-) diff --git a/ordvec-manifest/src/lib.rs b/ordvec-manifest/src/lib.rs index b6d9263..f26b17f 100644 --- a/ordvec-manifest/src/lib.rs +++ b/ordvec-manifest/src/lib.rs @@ -61,7 +61,8 @@ pub struct ManifestDocument { pub fn load_manifest_file(path: impl AsRef) -> Result { let path = path.as_ref(); let file = File::open(path)?; - let manifest: IndexManifest = serde_json::from_reader(file)?; + let reader = BufReader::new(file); + let manifest: IndexManifest = serde_json::from_reader(reader)?; let base_dir = path .parent() .filter(|p| 
!p.as_os_str().is_empty()) @@ -746,7 +747,7 @@ impl VerificationReport { fn new(manifest_id: Option) -> Self { Self { ok: false, - checked_at: Utc::now().to_rfc3339_opts(SecondsFormat::Secs, true), + checked_at: Utc::now().to_rfc3339_opts(SecondsFormat::Nanos, true), manifest_id, artifact: ArtifactReport::default(), row_identity: RowIdentityReport::default(), @@ -872,6 +873,9 @@ pub fn create_manifest_for_index( .parent() .filter(|p| !p.as_os_str().is_empty()) .unwrap_or_else(|| Path::new(".")); + if !out_base.exists() { + fs::create_dir_all(out_base)?; + } let metadata = probe_index_metadata(index_path)?; let index_hash = sha256_file(index_path)?; let artifact = Artifact { @@ -1042,7 +1046,7 @@ fn manifest_relative_path(path: &Path, base_dir: &Path) -> String { fn path_to_manifest_string(path: &Path) -> String { if path.is_absolute() { - return path.display().to_string(); + return path.display().to_string().replace('\\', "/"); } let parts = path .components() diff --git a/ordvec-manifest/src/main.rs b/ordvec-manifest/src/main.rs index 0ad9948..29dd426 100644 --- a/ordvec-manifest/src/main.rs +++ b/ordvec-manifest/src/main.rs @@ -264,7 +264,7 @@ fn run_sqlite(command: SqliteCommands) -> Result { force, )?; emit_report(&report, as_json)?; - Ok(if report.ok || force { + Ok(if report.ok { 0 } else { EXIT_VERIFICATION_FAILED diff --git a/ordvec-manifest/src/sqlite.rs b/ordvec-manifest/src/sqlite.rs index 9010ee1..3a22cac 100644 --- a/ordvec-manifest/src/sqlite.rs +++ b/ordvec-manifest/src/sqlite.rs @@ -1,7 +1,12 @@ -use crate::{verify_manifest, ManifestDocument, ManifestError, VerificationReport, VerifyOptions}; +use crate::{ + resolve_existing_path, sha256_file, verify_manifest, ManifestDocument, ManifestError, + ReportIssue, RowIdentity, VerificationReport, VerifyOptions, +}; use chrono::{SecondsFormat, Utc}; use rusqlite::{params, Connection, OptionalExtension}; -use std::path::Path; +use serde::Serialize; +use sha2::{Digest, Sha256}; +use std::path::{Path, 
PathBuf}; pub fn verify_with_registry( db_path: impl AsRef, @@ -13,16 +18,26 @@ pub fn verify_with_registry( let mut conn = Connection::open(db_path).map_err(sqlite_err)?; init(&conn)?; if use_cache { - return load_cached_report(&conn, &document.manifest.manifest_id)?.ok_or_else(|| { - ManifestError::invalid(format!( - "no cached verification report for manifest_id {}", - document.manifest.manifest_id - )) - }); + if let Some(cache_key) = current_cache_key(document, manifest_path.as_ref(), &options)? { + if let Some(report) = + load_cached_report(&conn, &document.manifest.manifest_id, &cache_key)? + { + return Ok(report); + } + } } + let store_options = options.clone(); let report = verify_manifest(document, options); - store_report(&mut conn, document, manifest_path.as_ref(), &report)?; + let cache_key = + cache_key_from_report(manifest_path.as_ref(), &report, document, &store_options)?; + store_report( + &mut conn, + document, + manifest_path.as_ref(), + &report, + cache_key.as_ref(), + )?; Ok(report) } @@ -35,8 +50,17 @@ pub fn activate( ) -> Result { let mut conn = Connection::open(db_path).map_err(sqlite_err)?; init(&conn)?; + let store_options = options.clone(); let report = verify_manifest(document, options); - store_report(&mut conn, document, manifest_path.as_ref(), &report)?; + let cache_key = + cache_key_from_report(manifest_path.as_ref(), &report, document, &store_options)?; + store_report( + &mut conn, + document, + manifest_path.as_ref(), + &report, + cache_key.as_ref(), + )?; if !report.ok && !force { return Ok(report); } @@ -52,7 +76,7 @@ pub fn activate( params![ document.manifest.manifest_id, manifest_path.as_ref().display().to_string(), - Utc::now().to_rfc3339_opts(SecondsFormat::Secs, true), + Utc::now().to_rfc3339_opts(SecondsFormat::Nanos, true), i64::from(force), ], ) @@ -61,15 +85,52 @@ pub fn activate( } fn init(conn: &Connection) -> Result<(), ManifestError> { + if verification_reports_needs_migration(conn)? 
{ + conn.execute_batch( + "ALTER TABLE verification_reports RENAME TO verification_reports_old; + CREATE TABLE verification_reports( + report_id INTEGER PRIMARY KEY AUTOINCREMENT, + manifest_id TEXT NOT NULL, + manifest_path TEXT NOT NULL, + checked_at TEXT NOT NULL, + ok INTEGER NOT NULL, + manifest_sha256 TEXT, + options_sha256 TEXT, + artifact_sha256 TEXT, + row_identity_sha256 TEXT, + report_json TEXT NOT NULL + ); + INSERT INTO verification_reports( + manifest_id, manifest_path, checked_at, ok, report_json + ) + SELECT manifest_id, manifest_path, checked_at, ok, report_json + FROM verification_reports_old; + DROP TABLE verification_reports_old;", + ) + .map_err(sqlite_err)?; + } conn.execute_batch( "CREATE TABLE IF NOT EXISTS verification_reports( + report_id INTEGER PRIMARY KEY AUTOINCREMENT, manifest_id TEXT NOT NULL, manifest_path TEXT NOT NULL, checked_at TEXT NOT NULL, ok INTEGER NOT NULL, - report_json TEXT NOT NULL, - PRIMARY KEY(manifest_id, checked_at) + manifest_sha256 TEXT, + options_sha256 TEXT, + artifact_sha256 TEXT, + row_identity_sha256 TEXT, + report_json TEXT NOT NULL ); + CREATE INDEX IF NOT EXISTS verification_reports_cache_idx + ON verification_reports( + manifest_id, + manifest_sha256, + options_sha256, + artifact_sha256, + row_identity_sha256, + report_id + ); CREATE TABLE IF NOT EXISTS active_manifest( id INTEGER PRIMARY KEY CHECK(id = 1), manifest_id TEXT NOT NULL, @@ -87,18 +148,31 @@ fn store_report( document: &ManifestDocument, manifest_path: &Path, report: &VerificationReport, + cache_key: Option<&CacheKey>, ) -> Result<(), ManifestError> { let tx = conn.transaction().map_err(sqlite_err)?; let report_json = serde_json::to_string(report)?; tx.execute( - "INSERT OR REPLACE INTO verification_reports( - manifest_id, manifest_path, checked_at, ok, report_json - ) VALUES(?1, ?2, ?3, ?4, ?5)", + "INSERT INTO verification_reports( + manifest_id, + manifest_path, + checked_at, + ok, + manifest_sha256, + options_sha256, + artifact_sha256, + 
row_identity_sha256, + report_json + ) VALUES(?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9)", params![ document.manifest.manifest_id, manifest_path.display().to_string(), report.checked_at, i64::from(report.ok), + cache_key.map(|key| key.manifest_sha256.as_str()), + cache_key.map(|key| key.options_sha256.as_str()), + cache_key.map(|key| key.artifact_sha256.as_str()), + cache_key.and_then(|key| key.row_identity_sha256.as_deref()), report_json, ], ) @@ -110,15 +184,29 @@ fn store_report( fn load_cached_report( conn: &Connection, manifest_id: &str, + cache_key: &CacheKey, ) -> Result, ManifestError> { let report_json: Option = conn .query_row( "SELECT report_json FROM verification_reports WHERE manifest_id = ?1 - ORDER BY checked_at DESC + AND manifest_sha256 = ?2 + AND options_sha256 = ?3 + AND artifact_sha256 = ?4 + AND ( + (row_identity_sha256 IS NULL AND ?5 IS NULL) + OR row_identity_sha256 = ?5 + ) + ORDER BY report_id DESC LIMIT 1", - params![manifest_id], + params![ + manifest_id, + cache_key.manifest_sha256.as_str(), + cache_key.options_sha256.as_str(), + cache_key.artifact_sha256.as_str(), + cache_key.row_identity_sha256.as_deref(), + ], |row| row.get(0), ) .optional() @@ -128,6 +216,161 @@ fn load_cached_report( .transpose() } +#[derive(Clone, Debug)] +struct CacheKey { + manifest_sha256: String, + options_sha256: String, + artifact_sha256: String, + row_identity_sha256: Option, +} + +#[derive(Serialize)] +struct CacheableVerifyOptions { + allow_absolute_paths: bool, + allow_path_escape: bool, + allow_duplicate_db_ids: bool, + index_override: Option, +} + +impl CacheableVerifyOptions { + fn from_options(options: &VerifyOptions) -> Self { + Self { + allow_absolute_paths: options.allow_absolute_paths, + allow_path_escape: options.allow_path_escape, + allow_duplicate_db_ids: options.allow_duplicate_db_ids, + index_override: options + .index_override + .as_ref() + .map(|path| path.display().to_string().replace('\\', "/")), + } + } +} + +fn current_cache_key( + document: 
&ManifestDocument, + manifest_path: &Path, + options: &VerifyOptions, +) -> Result, ManifestError> { + let manifest_sha256 = match sha256_file(manifest_path) { + Ok(hash) => hash.sha256, + Err(_) => return Ok(None), + }; + let options_json = serde_json::to_vec(&CacheableVerifyOptions::from_options(options))?; + let options_sha256 = sha256_bytes(&options_json); + + let artifact_path = options + .index_override + .as_ref() + .cloned() + .unwrap_or_else(|| PathBuf::from(&document.manifest.artifact.path)); + let mut path_errors = Vec::::new(); + let Some(artifact) = resolve_existing_path( + &artifact_path, + &document.base_dir, + options, + "artifact", + &mut path_errors, + ) else { + return Ok(None); + }; + let artifact_sha256 = match sha256_file(&artifact.resolved_path) { + Ok(hash) => hash.sha256, + Err(_) => return Ok(None), + }; + + let row_identity_sha256 = match &document.manifest.row_identity { + RowIdentity::RowIdIdentity { .. } => None, + RowIdentity::Jsonl { path, .. } => { + let row_path = PathBuf::from(path); + let Some(row_identity) = resolve_existing_path( + &row_path, + &document.base_dir, + options, + "row_identity", + &mut path_errors, + ) else { + return Ok(None); + }; + match sha256_file(&row_identity.resolved_path) { + Ok(hash) => Some(hash.sha256), + Err(_) => return Ok(None), + } + } + }; + + Ok(Some(CacheKey { + manifest_sha256, + options_sha256, + artifact_sha256, + row_identity_sha256, + })) +} + +fn cache_key_from_report( + manifest_path: &Path, + report: &VerificationReport, + document: &ManifestDocument, + options: &VerifyOptions, +) -> Result, ManifestError> { + let manifest_sha256 = match sha256_file(manifest_path) { + Ok(hash) => hash.sha256, + Err(_) => return Ok(None), + }; + let options_json = serde_json::to_vec(&CacheableVerifyOptions::from_options(options))?; + let options_sha256 = sha256_bytes(&options_json); + let Some(artifact_sha256) = report.artifact.sha256.clone() else { + return Ok(None); + }; + let row_identity_sha256 = 
match &document.manifest.row_identity { + RowIdentity::RowIdIdentity { .. } => None, + RowIdentity::Jsonl { .. } => { + let Some(sha256) = report.row_identity.sha256.clone() else { + return Ok(None); + }; + Some(sha256) + } + }; + Ok(Some(CacheKey { + manifest_sha256, + options_sha256, + artifact_sha256, + row_identity_sha256, + })) +} + +fn sha256_bytes(bytes: &[u8]) -> String { + let mut hasher = Sha256::new(); + hasher.update(bytes); + hex::encode(hasher.finalize()) +} + +fn verification_reports_needs_migration(conn: &Connection) -> Result { + let exists: Option = conn + .query_row( + "SELECT 1 + FROM sqlite_master + WHERE type = 'table' AND name = 'verification_reports'", + [], + |row| row.get(0), + ) + .optional() + .map_err(sqlite_err)?; + if exists.is_none() { + return Ok(false); + } + + let mut stmt = conn + .prepare("PRAGMA table_info(verification_reports)") + .map_err(sqlite_err)?; + let columns = stmt + .query_map([], |row| row.get::<_, String>(1)) + .map_err(sqlite_err)? + .collect::, _>>() + .map_err(sqlite_err)?; + Ok(!columns.iter().any(|column| column == "report_id") + || !columns.iter().any(|column| column == "manifest_sha256")) +} + fn sqlite_err(err: rusqlite::Error) -> ManifestError { ManifestError::invalid(format!("sqlite error: {err}")) } diff --git a/ordvec-manifest/tests/manifest.rs b/ordvec-manifest/tests/manifest.rs index b232aee..b1b2bb8 100644 --- a/ordvec-manifest/tests/manifest.rs +++ b/ordvec-manifest/tests/manifest.rs @@ -66,6 +66,24 @@ fn create_then_verify_identity_manifest() { ); } +#[test] +fn create_manifest_creates_output_parent_for_programmatic_callers() { + let temp = tempfile::tempdir().unwrap(); + let index = write_index(temp.path()); + let manifest_path = temp.path().join("nested").join("manifest.json"); + + let manifest = create_manifest_for_index( + &index, + CreateRowIdentity::RowIdIdentity, + "test-embedding", + &manifest_path, + ) + .unwrap(); + + assert!(manifest_path.parent().unwrap().is_dir()); + 
assert_eq!(manifest.row_identity.row_count(), 2); +} + #[test] fn schema_rejects_unknown_fields_and_bad_extension_keys() { let root = tempfile::tempdir().unwrap(); @@ -379,6 +397,7 @@ fn cli_create_verify_and_exit_codes() { #[cfg(feature = "sqlite")] #[test] fn sqlite_cache_is_explicit_and_activation_reverifies_by_default() { + use rusqlite::Connection; use std::fs::OpenOptions; let temp = tempfile::tempdir().unwrap(); @@ -404,11 +423,32 @@ fn sqlite_cache_is_explicit_and_activation_reverifies_by_default() { &document, &manifest_path, VerifyOptions::default(), - false, + true, ) .unwrap(); assert!(report.ok, "{:?}", report.errors); + let second_fresh = ordvec_manifest::sqlite::verify_with_registry( + &db, + &document, + &manifest_path, + VerifyOptions::default(), + false, + ) + .unwrap(); + assert!(second_fresh.ok, "{:?}", second_fresh.errors); + + let conn = Connection::open(&db).unwrap(); + let count: i64 = conn + .query_row("SELECT COUNT(*) FROM verification_reports", [], |row| { + row.get(0) + }) + .unwrap(); + assert!( + count >= 2, + "rapid verifications must preserve report history" + ); + OpenOptions::new() .append(true) .open(&index) @@ -424,7 +464,10 @@ fn sqlite_cache_is_explicit_and_activation_reverifies_by_default() { true, ) .unwrap(); - assert!(cached.ok, "{:?}", cached.errors); + assert!( + !cached.ok, + "cache key mismatch must force fresh verification" + ); let fresh = ordvec_manifest::sqlite::verify_with_registry( &db, @@ -455,4 +498,19 @@ fn sqlite_cache_is_explicit_and_activation_reverifies_by_default() { ) .unwrap(); assert!(!forced.ok); + + let bin = env!("CARGO_BIN_EXE_ordvec-manifest"); + let output = Command::new(bin) + .args([ + "sqlite", + "activate", + "--db", + db.to_str().unwrap(), + "--manifest", + manifest_path.to_str().unwrap(), + "--force", + ]) + .output() + .unwrap(); + assert_eq!(output.status.code(), Some(1)); } diff --git a/src/rank_io.rs b/src/rank_io.rs index ae2adf3..a297c5f 100644 --- a/src/rank_io.rs +++ b/src/rank_io.rs 
@@ -1056,6 +1056,73 @@ mod tests { std::fs::remove_file(&huge_declared).ok(); } + #[test] + fn probe_rejects_format_specific_header_errors() { + let mut bad_bits = Vec::new(); + bad_bits.extend_from_slice(b"TVRQ"); + bad_bits.push(VERSION); + bad_bits.push(3); + bad_bits.extend_from_slice(&8u32.to_le_bytes()); + bad_bits.extend_from_slice(&0u32.to_le_bytes()); + let path = forge("probe_bad_bits.tvrq", &bad_bits); + assert_eq!( + probe_index_metadata(&path).unwrap_err().kind(), + std::io::ErrorKind::InvalidData + ); + std::fs::remove_file(&path).ok(); + + let mut bad_rq_dim = Vec::new(); + bad_rq_dim.extend_from_slice(b"TVRQ"); + bad_rq_dim.push(VERSION); + bad_rq_dim.push(4); + bad_rq_dim.extend_from_slice(&8u32.to_le_bytes()); + bad_rq_dim.extend_from_slice(&0u32.to_le_bytes()); + let path = forge("probe_bad_rq_dim.tvrq", &bad_rq_dim); + assert_eq!( + probe_index_metadata(&path).unwrap_err().kind(), + std::io::ErrorKind::InvalidData + ); + std::fs::remove_file(&path).ok(); + + let mut bad_bitmap_dim = Vec::new(); + bad_bitmap_dim.extend_from_slice(b"TVBM"); + bad_bitmap_dim.push(VERSION); + bad_bitmap_dim.extend_from_slice(&100u32.to_le_bytes()); + bad_bitmap_dim.extend_from_slice(&10u32.to_le_bytes()); + bad_bitmap_dim.extend_from_slice(&0u32.to_le_bytes()); + let path = forge("probe_bad_bitmap_dim.tvbm", &bad_bitmap_dim); + assert_eq!( + probe_index_metadata(&path).unwrap_err().kind(), + std::io::ErrorKind::InvalidData + ); + std::fs::remove_file(&path).ok(); + + let mut bad_n_top = Vec::new(); + bad_n_top.extend_from_slice(b"TVBM"); + bad_n_top.push(VERSION); + bad_n_top.extend_from_slice(&64u32.to_le_bytes()); + bad_n_top.extend_from_slice(&64u32.to_le_bytes()); + bad_n_top.extend_from_slice(&0u32.to_le_bytes()); + let path = forge("probe_bad_n_top.tvbm", &bad_n_top); + assert_eq!( + probe_index_metadata(&path).unwrap_err().kind(), + std::io::ErrorKind::InvalidData + ); + std::fs::remove_file(&path).ok(); + + let mut bad_sign_dim = Vec::new(); + 
bad_sign_dim.extend_from_slice(b"TVSB"); + bad_sign_dim.push(VERSION); + bad_sign_dim.extend_from_slice(&32u32.to_le_bytes()); + bad_sign_dim.extend_from_slice(&0u32.to_le_bytes()); + let path = forge("probe_bad_sign_dim.tvsb", &bad_sign_dim); + assert_eq!( + probe_index_metadata(&path).unwrap_err().kind(), + std::io::ErrorKind::InvalidData + ); + std::fs::remove_file(&path).ok(); + } + #[test] fn probe_does_not_validate_payload_row_invariants() { let mut forged = Vec::new(); From b5fbe38caf4f51e09358533d2dc3ad4da55653e1 Mon Sep 17 00:00:00 2001 From: Nelson Spence Date: Fri, 29 May 2026 09:45:50 -0500 Subject: [PATCH 3/6] tighten manifest v1 merge gates Signed-off-by: Nelson Spence --- CHANGELOG.md | 3 +- docs/INDEX_PROVENANCE.md | 5 + ordvec-manifest/README.md | 13 ++ ordvec-manifest/src/lib.rs | 217 +++++++++++++++++++-- ordvec-manifest/src/main.rs | 24 ++- ordvec-manifest/src/sqlite.rs | 15 +- ordvec-manifest/tests/manifest.rs | 313 +++++++++++++++++++++++++++--- 7 files changed, 543 insertions(+), 47 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9845b31..e565531 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,7 +13,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 `Bitmap`, and `SignBitmap` headers without allocating payloads. - Added the repo-local, publish=false `ordvec-manifest` crate with a strict v1 JSON schema, artifact and row-identity verification, attestation shape - checks, a CLI, and optional SQLite registry/cache support. + checks, a CLI, and optional SQLite cache/audit support with one active + manifest pointer. 
### Documentation diff --git a/docs/INDEX_PROVENANCE.md b/docs/INDEX_PROVENANCE.md index 4dc2c3f..91795f8 100644 --- a/docs/INDEX_PROVENANCE.md +++ b/docs/INDEX_PROVENANCE.md @@ -47,6 +47,11 @@ that pre-load step: cargo run -p ordvec-manifest -- verify --manifest path/to/index.manifest.json ``` +The `create` command emits default-verifiable manifests by default: artifact +and row-identity paths must resolve under the output manifest directory. If a +deployment intentionally keeps those files outside that directory, create with +`--allow-path-escape` and verify with the matching path-policy flag. + The manifest verifier checks: - the index bytes against the manifest's SHA-256 digest; diff --git a/ordvec-manifest/README.md b/ordvec-manifest/README.md index b333cac..b9167fe 100644 --- a/ordvec-manifest/README.md +++ b/ordvec-manifest/README.md @@ -20,3 +20,16 @@ cargo run -p ordvec-manifest -- verify --manifest path/to/index.manifest.json The schema version is `ordvec.index_manifest.v1`. Relative paths resolve from the manifest file's directory, absolute paths are rejected by default, and relative paths may not escape the manifest directory unless explicitly allowed. +`create` follows the same policy: by default it emits only paths that should +verify with default settings. If an artifact or JSONL row map lives outside the +manifest directory, pass `--allow-path-escape` at create time and again at +verify time. + +With `--features sqlite`, the `sqlite verify` and `sqlite activate` subcommands +add a local cache/audit log plus one active-manifest pointer. This is not a +full named registry. `sqlite verify --use-cache` reuses only reports whose +manifest, verification options, artifact bytes, and row-identity bytes still +match; otherwise it runs fresh verification and stores a new report. 
`sqlite +activate --force` writes the active pointer even when verification fails, emits +a `sqlite_activation_forced` warning in JSON output, and exits zero because it +did mutate activation state. diff --git a/ordvec-manifest/src/lib.rs b/ordvec-manifest/src/lib.rs index f26b17f..5798814 100644 --- a/ordvec-manifest/src/lib.rs +++ b/ordvec-manifest/src/lib.rs @@ -88,6 +88,16 @@ pub fn verify_manifest_with_base( verify_manifest(&document, options) } +pub fn verify_index_manifest( + index_path: impl Into, + manifest_path: impl AsRef, + mut options: VerifyOptions, +) -> Result { + let document = load_manifest_file(manifest_path)?; + options.index_override = Some(index_path.into()); + Ok(verify_manifest(&document, options)) +} + pub fn verify_manifest(document: &ManifestDocument, options: VerifyOptions) -> VerificationReport { let mut report = VerificationReport::new(Some(document.manifest.manifest_id.clone())); validate_manifest_shape(&document.manifest, &mut report); @@ -189,7 +199,7 @@ fn validate_manifest_shape(manifest: &IndexManifest, report: &mut VerificationRe if !is_sha256_hex(&manifest.artifact.sha256) { report.error( "artifact_sha256_invalid", - "artifact.sha256 must be a 64-character hex SHA-256 digest", + "artifact.sha256 must be a lowercase 64-character hex SHA-256 digest", ); } if manifest.artifact.bytes_per_vec == 0 { @@ -240,7 +250,7 @@ fn validate_manifest_shape(manifest: &IndexManifest, report: &mut VerificationRe if !is_sha256_hex(sha256) { report.error( "row_identity_sha256_invalid", - "row_identity.sha256 must be a 64-character hex SHA-256 digest", + "row_identity.sha256 must be a lowercase 64-character hex SHA-256 digest", ); } if id_kind != "uuid" { @@ -251,6 +261,31 @@ fn validate_manifest_shape(manifest: &IndexManifest, report: &mut VerificationRe } } + validate_optional_non_empty( + "embedding_model_revision_empty", + "embedding.model_revision must be non-empty when present", + manifest.embedding.model_revision.as_deref(), + report, + ); + 
validate_optional_sha256( + "embedding_corpus_digest_invalid", + "embedding.corpus_digest must be a lowercase 64-character hex SHA-256 digest", + manifest.embedding.corpus_digest.as_deref(), + report, + ); + validate_optional_sha256( + "embedding_matrix_digest_invalid", + "embedding.embedding_matrix_digest must be a lowercase 64-character hex SHA-256 digest", + manifest.embedding.embedding_matrix_digest.as_deref(), + report, + ); + validate_optional_non_empty( + "embedding_normalization_empty", + "embedding.normalization must be non-empty when present", + manifest.embedding.normalization.as_deref(), + report, + ); + if let Some(build) = &manifest.build { if build.invocation_id.trim().is_empty() { report.error( @@ -268,6 +303,30 @@ fn validate_manifest_shape(manifest: &IndexManifest, report: &mut VerificationRe "build.builder_id must be non-empty", ); } + validate_optional_non_empty( + "build_source_repo_empty", + "build.source_repo must be non-empty when present", + build.source_repo.as_deref(), + report, + ); + validate_optional_non_empty( + "build_source_commit_empty", + "build.source_commit must be non-empty when present", + build.source_commit.as_deref(), + report, + ); + validate_optional_non_empty( + "build_ci_provider_empty", + "build.ci_provider must be non-empty when present", + build.ci_provider.as_deref(), + report, + ); + validate_optional_non_empty( + "build_ci_run_id_empty", + "build.ci_run_id must be non-empty when present", + build.ci_run_id.as_deref(), + report, + ); } for key in manifest.extensions.keys() { @@ -280,6 +339,28 @@ fn validate_manifest_shape(manifest: &IndexManifest, report: &mut VerificationRe } } +fn validate_optional_non_empty( + code: &str, + message: &str, + value: Option<&str>, + report: &mut VerificationReport, +) { + if value.is_some_and(|value| value.trim().is_empty()) { + report.error(code, message); + } +} + +fn validate_optional_sha256( + code: &str, + message: &str, + value: Option<&str>, + report: &mut 
VerificationReport, +) { + if value.is_some_and(|value| !is_sha256_hex(value)) { + report.error(code, message); + } +} + fn artifact_kind_matches_params(kind: ManifestIndexKind, params: &ManifestIndexParams) -> bool { matches!( (kind, params), @@ -645,6 +726,14 @@ pub struct Artifact { pub struct Embedding { pub model: String, pub dim: usize, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub model_revision: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub corpus_digest: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub embedding_matrix_digest: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub normalization: Option, } #[derive(Clone, Debug, Serialize, Deserialize)] @@ -653,6 +742,14 @@ pub struct BuildInfo { pub invocation_id: String, #[serde(default, skip_serializing_if = "Option::is_none")] pub builder_id: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub source_repo: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub source_commit: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub ci_provider: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub ci_run_id: Option, } #[derive(Clone, Debug, Serialize, Deserialize)] @@ -861,11 +958,33 @@ pub enum CreateRowIdentity { Jsonl(PathBuf), } +#[derive(Clone, Debug, Default)] +pub struct CreateManifestOptions { + pub allow_absolute_paths: bool, + pub allow_path_escape: bool, +} + pub fn create_manifest_for_index( index_path: impl AsRef, row_identity: CreateRowIdentity, embedding_model: impl Into, out_path: impl AsRef, +) -> Result { + create_manifest_for_index_with_options( + index_path, + row_identity, + embedding_model, + out_path, + CreateManifestOptions::default(), + ) +} + +pub fn create_manifest_for_index_with_options( + index_path: impl AsRef, + row_identity: CreateRowIdentity, + embedding_model: impl Into, + out_path: impl AsRef, + 
options: CreateManifestOptions, ) -> Result { let index_path = index_path.as_ref(); let out_path = out_path.as_ref(); @@ -879,7 +998,7 @@ pub fn create_manifest_for_index( let metadata = probe_index_metadata(index_path)?; let index_hash = sha256_file(index_path)?; let artifact = Artifact { - path: manifest_relative_path(index_path, out_base), + path: manifest_path_for_create(index_path, out_base, &options, "artifact")?, sha256: index_hash.sha256, kind: ManifestIndexKind::from_core(metadata.kind), format_version: metadata.format_version, @@ -915,7 +1034,7 @@ pub fn create_manifest_for_index( ))); } RowIdentity::Jsonl { - path: manifest_relative_path(&path, out_base), + path: manifest_path_for_create(&path, out_base, &options, "row identity")?, sha256: row_hash.sha256, row_count: stats.row_count, id_kind: "uuid".to_string(), @@ -933,11 +1052,19 @@ pub fn create_manifest_for_index( embedding: Embedding { model: embedding_model.into(), dim: metadata.dim, + model_revision: None, + corpus_digest: None, + embedding_matrix_digest: None, + normalization: None, }, row_identity, build: Some(BuildInfo { invocation_id, builder_id: Some("ordvec-manifest".to_string()), + source_repo: None, + source_commit: None, + ci_provider: None, + ci_run_id: None, }), attestations: Vec::new(), extensions: BTreeMap::new(), @@ -1031,17 +1158,74 @@ fn validate_row_id_string( } } -fn manifest_relative_path(path: &Path, base_dir: &Path) -> String { - let canonical_path = fs::canonicalize(path); - let canonical_base = fs::canonicalize(base_dir); - if let (Ok(canonical_path), Ok(canonical_base)) = (canonical_path, canonical_base) { - if let Ok(relative) = canonical_path.strip_prefix(&canonical_base) { - if !relative.as_os_str().is_empty() { - return path_to_manifest_string(relative); - } +fn manifest_path_for_create( + path: &Path, + base_dir: &Path, + options: &CreateManifestOptions, + context: &str, +) -> Result { + let canonical_path = fs::canonicalize(path)?; + let canonical_base = 
fs::canonicalize(base_dir)?; + if let Ok(relative) = canonical_path.strip_prefix(&canonical_base) { + if !relative.as_os_str().is_empty() { + return Ok(path_to_manifest_string(relative)); + } + return Ok(".".to_string()); + } + + if !options.allow_path_escape { + return Err(ManifestError::invalid(format!( + "{context} path {} is outside manifest directory {}; use --allow-path-escape to create a manifest that requires non-default verification policy", + canonical_path.display(), + canonical_base.display() + ))); + } + + if let Some(relative) = relative_path_between(&canonical_base, &canonical_path) { + return Ok(path_to_manifest_string(&relative)); + } + + if options.allow_absolute_paths { + return Ok(path_to_manifest_string(&canonical_path)); + } + + Err(ManifestError::invalid(format!( + "{context} path {} cannot be expressed relative to manifest directory {}; use --allow-absolute-paths with --allow-path-escape", + canonical_path.display(), + canonical_base.display() + ))) +} + +fn relative_path_between(base: &Path, target: &Path) -> Option { + let base_components = base.components().collect::>(); + let target_components = target.components().collect::>(); + let mut common = 0usize; + while common < base_components.len() + && common < target_components.len() + && base_components[common] == target_components[common] + { + common += 1; + } + + if common == 0 { + return None; + } + + let mut relative = PathBuf::new(); + for component in &base_components[common..] { + if matches!(component, Component::Normal(_)) { + relative.push(".."); + } + } + for component in &target_components[common..] 
{ + match component { + Component::Normal(part) => relative.push(part), + Component::CurDir => {} + Component::ParentDir => relative.push(".."), + Component::Prefix(_) | Component::RootDir => return None, } } - path_to_manifest_string(path) + Some(relative) } fn path_to_manifest_string(path: &Path) -> String { @@ -1098,11 +1282,14 @@ fn valid_extension_part(part: &str) -> bool { } fn is_sha256_hex(value: &str) -> bool { - value.len() == 64 && value.bytes().all(|b| b.is_ascii_hexdigit()) + value.len() == 64 + && value + .bytes() + .all(|b| b.is_ascii_digit() || matches!(b, b'a'..=b'f')) } fn hex_digest_eq(a: &str, b: &str) -> bool { - a.eq_ignore_ascii_case(b) + a == b } #[cfg(feature = "sqlite")] diff --git a/ordvec-manifest/src/main.rs b/ordvec-manifest/src/main.rs index 29dd426..5d1b412 100644 --- a/ordvec-manifest/src/main.rs +++ b/ordvec-manifest/src/main.rs @@ -1,7 +1,8 @@ use clap::{Parser, Subcommand}; use ordvec_manifest::{ - create_manifest_for_index, load_manifest_file, sha256_file, verify_manifest, - write_manifest_file, CreateRowIdentity, ManifestDocument, ManifestError, VerifyOptions, + create_manifest_for_index_with_options, load_manifest_file, sha256_file, verify_manifest, + write_manifest_file, CreateManifestOptions, CreateRowIdentity, ManifestDocument, ManifestError, + VerifyOptions, }; use serde_json::json; use std::fs; @@ -55,6 +56,10 @@ enum Commands { embedding_model: String, #[arg(long)] out: PathBuf, + #[arg(long)] + allow_absolute_paths: bool, + #[arg(long)] + allow_path_escape: bool, }, #[cfg(feature = "sqlite")] Sqlite { @@ -179,6 +184,8 @@ fn run() -> Result { row_id_is_identity, embedding_model, out, + allow_absolute_paths, + allow_path_escape, } => { let row_identity = match (row_map, row_id_is_identity) { (Some(_), true) => { @@ -197,7 +204,16 @@ fn run() -> Result { if let Some(parent) = out.parent().filter(|p| !p.as_os_str().is_empty()) { fs::create_dir_all(parent)?; } - let manifest = create_manifest_for_index(&index, row_identity, 
embedding_model, &out)?; + let manifest = create_manifest_for_index_with_options( + &index, + row_identity, + embedding_model, + &out, + CreateManifestOptions { + allow_absolute_paths, + allow_path_escape, + }, + )?; write_manifest_file(&manifest, &out)?; println!("{}", out.display()); Ok(0) @@ -264,7 +280,7 @@ fn run_sqlite(command: SqliteCommands) -> Result { force, )?; emit_report(&report, as_json)?; - Ok(if report.ok { + Ok(if report.ok || force { 0 } else { EXIT_VERIFICATION_FAILED diff --git a/ordvec-manifest/src/sqlite.rs b/ordvec-manifest/src/sqlite.rs index 3a22cac..3b14a12 100644 --- a/ordvec-manifest/src/sqlite.rs +++ b/ordvec-manifest/src/sqlite.rs @@ -51,9 +51,18 @@ pub fn activate( let mut conn = Connection::open(db_path).map_err(sqlite_err)?; init(&conn)?; let store_options = options.clone(); - let report = verify_manifest(document, options); - let cache_key = - cache_key_from_report(manifest_path.as_ref(), &report, document, &store_options)?; + let mut report = verify_manifest(document, options); + if !report.ok && force { + report.warnings.push(ReportIssue::new( + "sqlite_activation_forced", + "sqlite activation was forced even though verification failed", + )); + } + let cache_key = if !report.ok && force { + None + } else { + cache_key_from_report(manifest_path.as_ref(), &report, document, &store_options)? 
+ }; store_report( &mut conn, document, diff --git a/ordvec-manifest/tests/manifest.rs b/ordvec-manifest/tests/manifest.rs index b1b2bb8..8722cc2 100644 --- a/ordvec-manifest/tests/manifest.rs +++ b/ordvec-manifest/tests/manifest.rs @@ -1,7 +1,8 @@ -use ordvec::RankQuant; +use ordvec::{Bitmap, Rank, RankQuant, SignBitmap}; use ordvec_manifest::{ - create_manifest_for_index, load_manifest_file, sha256_file, verify_manifest_with_base, - CreateRowIdentity, RowIdentity, VerifyOptions, + create_manifest_for_index, create_manifest_for_index_with_options, load_manifest_file, + sha256_file, verify_index_manifest, verify_manifest_with_base, CreateManifestOptions, + CreateRowIdentity, ManifestIndexParams, RowIdentity, VerifyOptions, }; use serde_json::json; use std::fs; @@ -18,6 +19,47 @@ fn write_index(dir: &Path) -> PathBuf { path } +#[derive(Clone, Copy)] +enum FixtureKind { + Rank, + RankQuant, + Bitmap, + SignBitmap, +} + +fn write_index_kind(dir: &Path, kind: FixtureKind) -> PathBuf { + match kind { + FixtureKind::Rank => { + let path = dir.join("index.tvr"); + let mut index = Rank::new(8); + index.add(&[ + 1.0, 3.0, 2.0, 4.0, 8.0, 7.0, 6.0, 5.0, 8.0, 6.0, 7.0, 5.0, 1.0, 2.0, 3.0, 4.0, + ]); + index.write(&path).unwrap(); + path + } + FixtureKind::RankQuant => write_index(dir), + FixtureKind::Bitmap => { + let path = dir.join("index.tvbm"); + let mut index = Bitmap::new(64, 16); + let docs: Vec = (0..128).map(|i| ((i * 17) % 31) as f32).collect(); + index.add(&docs); + index.write(&path).unwrap(); + path + } + FixtureKind::SignBitmap => { + let path = dir.join("index.tvsb"); + let mut index = SignBitmap::new(64); + let docs: Vec = (0usize..128) + .map(|i| if i.is_multiple_of(3) { 1.0 } else { -1.0 }) + .collect(); + index.add(&docs); + index.write(&path).unwrap(); + path + } + } +} + fn write_row_map(path: &Path, rows: &[(&str, Option<&str>)]) { let mut file = fs::File::create(path).unwrap(); for (row_id, (db_id, parent_id)) in rows.iter().enumerate() { @@ -45,25 
+87,39 @@ fn identity_manifest(dir: &Path) -> (tempfile::TempDir, ordvec_manifest::IndexMa } #[test] -fn create_then_verify_identity_manifest() { +fn create_then_verify_identity_manifest_for_all_persisted_formats() { let temp = tempfile::tempdir().unwrap(); - let index = write_index(temp.path()); - let manifest_path = temp.path().join("manifest.json"); - let manifest = create_manifest_for_index( - &index, - CreateRowIdentity::RowIdIdentity, - "test-embedding", - &manifest_path, - ) - .unwrap(); + for (kind, expected) in [ + (FixtureKind::Rank, ordvec_manifest::ManifestIndexKind::Rank), + ( + FixtureKind::RankQuant, + ordvec_manifest::ManifestIndexKind::RankQuant, + ), + ( + FixtureKind::Bitmap, + ordvec_manifest::ManifestIndexKind::Bitmap, + ), + ( + FixtureKind::SignBitmap, + ordvec_manifest::ManifestIndexKind::SignBitmap, + ), + ] { + let case = tempfile::tempdir_in(temp.path()).unwrap(); + let index = write_index_kind(case.path(), kind); + let manifest_path = case.path().join("manifest.json"); + let manifest = create_manifest_for_index( + &index, + CreateRowIdentity::RowIdIdentity, + "test-embedding", + &manifest_path, + ) + .unwrap(); - let report = verify_manifest_with_base(manifest, temp.path(), VerifyOptions::default()); - assert!(report.ok, "{:?}", report.errors); - assert_eq!(report.skipped_checks, ["attestations_absent"]); - assert_eq!( - report.artifact.metadata.unwrap().kind, - ordvec_manifest::ManifestIndexKind::RankQuant - ); + let report = verify_manifest_with_base(manifest, case.path(), VerifyOptions::default()); + assert!(report.ok, "{:?}", report.errors); + assert_eq!(report.skipped_checks, ["attestations_absent"]); + assert_eq!(report.artifact.metadata.unwrap().kind, expected); + } } #[test] @@ -72,11 +128,15 @@ fn create_manifest_creates_output_parent_for_programmatic_callers() { let index = write_index(temp.path()); let manifest_path = temp.path().join("nested").join("manifest.json"); - let manifest = create_manifest_for_index( + let manifest = 
create_manifest_for_index_with_options( &index, CreateRowIdentity::RowIdIdentity, "test-embedding", &manifest_path, + CreateManifestOptions { + allow_path_escape: true, + ..CreateManifestOptions::default() + }, ) .unwrap(); @@ -118,6 +178,42 @@ fn schema_rejects_unknown_fields_and_bad_extension_keys() { assert!(report.ok, "{:?}", report.errors); } +#[test] +fn schema_enforces_lowercase_sha256_and_optional_field_shapes() { + let root = tempfile::tempdir().unwrap(); + let (temp, mut manifest, _manifest_path) = identity_manifest(root.path()); + manifest.artifact.sha256 = manifest.artifact.sha256.to_ascii_uppercase(); + manifest.row_identity = RowIdentity::Jsonl { + path: "rows.jsonl".to_string(), + sha256: "A".repeat(64), + row_count: 2, + id_kind: "uuid".to_string(), + db: None, + }; + manifest.embedding.model_revision = Some("".to_string()); + manifest.embedding.corpus_digest = Some("A".repeat(64)); + manifest.embedding.embedding_matrix_digest = Some("not-a-digest".to_string()); + manifest.embedding.normalization = Some("".to_string()); + manifest.build.as_mut().unwrap().source_repo = Some("".to_string()); + + let report = verify_manifest_with_base(manifest, temp.path(), VerifyOptions::default()); + for code in [ + "artifact_sha256_invalid", + "row_identity_sha256_invalid", + "embedding_model_revision_empty", + "embedding_corpus_digest_invalid", + "embedding_matrix_digest_invalid", + "embedding_normalization_empty", + "build_source_repo_empty", + ] { + assert!( + report.errors.iter().any(|issue| issue.code == code), + "missing {code}: {:?}", + report.errors + ); + } +} + #[test] fn artifact_metadata_mismatches_are_reported_with_stable_codes() { let root = tempfile::tempdir().unwrap(); @@ -131,6 +227,51 @@ fn artifact_metadata_mismatches_are_reported_with_stable_codes() { .errors .iter() .any(|issue| issue.code == "artifact_dim_mismatch")); + + let (temp, mut manifest, _manifest_path) = identity_manifest(root.path()); + manifest.artifact.params = 
ManifestIndexParams::RankQuant { bits: 4 }; + let report = verify_manifest_with_base(manifest, temp.path(), VerifyOptions::default()); + assert!(report + .errors + .iter() + .any(|issue| issue.code == "artifact_params_mismatch")); + + let case = tempfile::tempdir_in(root.path()).unwrap(); + let bitmap = write_index_kind(case.path(), FixtureKind::Bitmap); + let manifest_path = case.path().join("bitmap.manifest.json"); + let mut manifest = create_manifest_for_index( + &bitmap, + CreateRowIdentity::RowIdIdentity, + "test-embedding", + &manifest_path, + ) + .unwrap(); + manifest.artifact.params = ManifestIndexParams::Bitmap { n_top: 8 }; + let report = verify_manifest_with_base(manifest, case.path(), VerifyOptions::default()); + assert!(report + .errors + .iter() + .any(|issue| issue.code == "artifact_params_mismatch")); +} + +#[test] +fn missing_artifact_and_row_count_mismatch_are_reported() { + let root = tempfile::tempdir().unwrap(); + let (temp, mut manifest, _manifest_path) = identity_manifest(root.path()); + manifest.row_identity = RowIdentity::RowIdIdentity { row_count: 1 }; + let report = verify_manifest_with_base(manifest.clone(), temp.path(), VerifyOptions::default()); + assert!(report + .errors + .iter() + .any(|issue| issue.code == "artifact_row_count_mismatch")); + + manifest.row_identity = RowIdentity::RowIdIdentity { row_count: 2 }; + fs::remove_file(temp.path().join(&manifest.artifact.path)).unwrap(); + let report = verify_manifest_with_base(manifest, temp.path(), VerifyOptions::default()); + assert!(report + .errors + .iter() + .any(|issue| issue.code == "artifact_path_unavailable")); } #[test] @@ -140,11 +281,15 @@ fn path_policy_rejects_escapes_and_absolute_paths_by_default() { fs::create_dir(&base).unwrap(); let index = write_index(root.path()); let manifest_path = base.join("manifest.json"); - let mut manifest = create_manifest_for_index( + let mut manifest = create_manifest_for_index_with_options( &index, CreateRowIdentity::RowIdIdentity, 
"test-embedding", &manifest_path, + CreateManifestOptions { + allow_path_escape: true, + ..CreateManifestOptions::default() + }, ) .unwrap(); @@ -197,11 +342,15 @@ fn symlink_escape_reports_observed_canonical_path() { let index = write_index(&outside); symlink(&index, base.join("link.tvrq")).unwrap(); let manifest_path = base.join("manifest.json"); - let mut manifest = create_manifest_for_index( + let mut manifest = create_manifest_for_index_with_options( &index, CreateRowIdentity::RowIdIdentity, "test-embedding", &manifest_path, + CreateManifestOptions { + allow_path_escape: true, + ..CreateManifestOptions::default() + }, ) .unwrap(); manifest.artifact.path = "link.tvrq".to_string(); @@ -394,6 +543,110 @@ fn cli_create_verify_and_exit_codes() { assert_eq!(output.status.code(), Some(2)); } +#[test] +fn create_outside_manifest_dir_requires_explicit_path_policy() { + let temp = tempfile::tempdir().unwrap(); + let outside = temp.path().join("outside"); + let manifests = temp.path().join("manifests"); + fs::create_dir(&outside).unwrap(); + let index = write_index(&outside); + let manifest_path = manifests.join("manifest.json"); + + let err = create_manifest_for_index( + &index, + CreateRowIdentity::RowIdIdentity, + "test-embedding", + &manifest_path, + ) + .unwrap_err(); + assert!(err.to_string().contains("outside manifest directory")); + + let bin = env!("CARGO_BIN_EXE_ordvec-manifest"); + let output = Command::new(bin) + .args([ + "create", + "--index", + index.to_str().unwrap(), + "--row-id-is-identity", + "--embedding-model", + "test-embedding", + "--out", + manifest_path.to_str().unwrap(), + ]) + .output() + .unwrap(); + assert_eq!(output.status.code(), Some(2)); + + let output = Command::new(bin) + .args([ + "create", + "--index", + index.to_str().unwrap(), + "--row-id-is-identity", + "--embedding-model", + "test-embedding", + "--out", + manifest_path.to_str().unwrap(), + "--allow-path-escape", + ]) + .output() + .unwrap(); + assert!( + output.status.success(), + 
"{}", + String::from_utf8_lossy(&output.stderr) + ); + + let output = Command::new(bin) + .args(["verify", "--manifest", manifest_path.to_str().unwrap()]) + .output() + .unwrap(); + assert_eq!(output.status.code(), Some(1)); + + let output = Command::new(bin) + .args([ + "verify", + "--manifest", + manifest_path.to_str().unwrap(), + "--allow-path-escape", + ]) + .output() + .unwrap(); + assert!( + output.status.success(), + "{}", + String::from_utf8_lossy(&output.stderr) + ); +} + +#[test] +fn verify_index_manifest_uses_explicit_index_override() { + let temp = tempfile::tempdir().unwrap(); + let index = write_index(temp.path()); + let manifest_path = temp.path().join("manifest.json"); + let mut manifest = create_manifest_for_index( + &index, + CreateRowIdentity::RowIdIdentity, + "test-embedding", + &manifest_path, + ) + .unwrap(); + manifest.artifact.path = "missing.tvrq".to_string(); + fs::write( + &manifest_path, + serde_json::to_string_pretty(&manifest).unwrap(), + ) + .unwrap(); + + let report = verify_index_manifest( + PathBuf::from("index.tvrq"), + &manifest_path, + VerifyOptions::default(), + ) + .unwrap(); + assert!(report.ok, "{:?}", report.errors); +} + #[cfg(feature = "sqlite")] #[test] fn sqlite_cache_is_explicit_and_activation_reverifies_by_default() { @@ -498,6 +751,10 @@ fn sqlite_cache_is_explicit_and_activation_reverifies_by_default() { ) .unwrap(); assert!(!forced.ok); + assert!(forced + .warnings + .iter() + .any(|issue| issue.code == "sqlite_activation_forced")); let bin = env!("CARGO_BIN_EXE_ordvec-manifest"); let output = Command::new(bin) @@ -509,8 +766,16 @@ fn sqlite_cache_is_explicit_and_activation_reverifies_by_default() { "--manifest", manifest_path.to_str().unwrap(), "--force", + "--json", ]) .output() .unwrap(); - assert_eq!(output.status.code(), Some(1)); + assert_eq!(output.status.code(), Some(0)); + let forced_report: ordvec_manifest::VerificationReport = + serde_json::from_slice(&output.stdout).unwrap(); + 
assert!(!forced_report.ok); + assert!(forced_report + .warnings + .iter() + .any(|issue| issue.code == "sqlite_activation_forced")); } From 96e45321d221d37cf2808bac16fc6a029b793b57 Mon Sep 17 00:00:00 2001 From: Nelson Spence Date: Fri, 29 May 2026 10:14:28 -0500 Subject: [PATCH 4/6] add calibration profile manifest binding Signed-off-by: Nelson Spence --- CHANGELOG.md | 3 + docs/INDEX_PROVENANCE.md | 12 + ordvec-manifest/README.md | 17 +- ordvec-manifest/src/lib.rs | 617 ++++++++++++++++++++++++++++++ ordvec-manifest/src/main.rs | 32 +- ordvec-manifest/src/sqlite.rs | 64 +++- ordvec-manifest/tests/manifest.rs | 453 +++++++++++++++++++++- 7 files changed, 1184 insertions(+), 14 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e565531..36f5cbc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 JSON schema, artifact and row-identity verification, attestation shape checks, a CLI, and optional SQLite cache/audit support with one active manifest pointer. +- Added optional typed calibration profile references to the v1 manifest + schema, with path/hash/identity/compatibility verification but no statistical + computation. 
### Documentation diff --git a/docs/INDEX_PROVENANCE.md b/docs/INDEX_PROVENANCE.md index 91795f8..ec4af7a 100644 --- a/docs/INDEX_PROVENANCE.md +++ b/docs/INDEX_PROVENANCE.md @@ -59,13 +59,25 @@ The manifest verifier checks: `SignBitmap`) without allocating the payload; - declared dimension, vector count, bytes-per-vector, format version, and format parameters against the probed metadata; +- the `embedding` block as the encoder/vector representation that produced + the index artifact; - row identity, either explicit `row_id_identity` or a strict JSONL row map whose `row_id` equals the zero-based line number and whose `db_id` is non-empty, NUL-free, and unique by default; +- optional `calibration` profile references, checking profile identity, + path/hash integrity, encoder identity, and ordinalization compatibility; - attestation **shape** only: predicate type, builder id when present, and at least one subject SHA-256 matching the artifact when attestations are supplied. +When present, `calibration` binds an index artifact to a hashed ordinal profile +used to interpret overlap, bucket, sign, or rank evidence under a calibrated +null. The verifier checks profile identity, path/hash integrity, encoder +identity, and ordinalization compatibility; it does not judge whether the null +model is scientifically adequate and does not compute likelihood ratios or tail +probabilities. Calibration profiles must match the encoder identity declared by +`embedding`; cross-encoder calibration is rejected by default. + Recipes that consume sidecar manifests should run the verifier first, then load/search/rerank only if verification succeeds. diff --git a/ordvec-manifest/README.md b/ordvec-manifest/README.md index b9167fe..495f4a8 100644 --- a/ordvec-manifest/README.md +++ b/ordvec-manifest/README.md @@ -2,9 +2,10 @@ Repo-local, publish=false sidecar verifier for ordvec index manifests. 
-It verifies index bytes, probed header metadata, row identity, and attestation -shape before a caller loads an ordvec index. It does not sign artifacts, manage -keys, call networks, mutate index files, decide deployment trust policy, or +It verifies index bytes, probed header metadata, row identity, optional +calibration profile references, and attestation shape before a caller loads an +ordvec index. It does not sign artifacts, manage keys, call networks, mutate +index files, decide deployment trust policy, compute calibration statistics, or change the C ABI. ```sh @@ -28,8 +29,8 @@ verify time. With `--features sqlite`, the `sqlite verify` and `sqlite activate` subcommands add a local cache/audit log plus one active-manifest pointer. This is not a full named registry. `sqlite verify --use-cache` reuses only reports whose -manifest, verification options, artifact bytes, and row-identity bytes still -match; otherwise it runs fresh verification and stores a new report. `sqlite -activate --force` writes the active pointer even when verification fails, emits -a `sqlite_activation_forced` warning in JSON output, and exits zero because it -did mutate activation state. +manifest, verification options, artifact bytes, row-identity bytes, and +calibration profile bytes still match; otherwise it runs fresh verification and +stores a new report. `sqlite activate --force` writes the active pointer even +when verification fails, emits a `sqlite_activation_forced` warning in JSON +output, and exits zero because it did mutate activation state. 
diff --git a/ordvec-manifest/src/lib.rs b/ordvec-manifest/src/lib.rs index 5798814..991f10c 100644 --- a/ordvec-manifest/src/lib.rs +++ b/ordvec-manifest/src/lib.rs @@ -13,6 +13,7 @@ use std::path::{Component, Path, PathBuf}; use uuid::Uuid; pub const SCHEMA_VERSION: &str = "ordvec.index_manifest.v1"; +pub const CALIBRATION_SCHEMA_VERSION: &str = "ordvec.calibration.v1"; #[derive(Debug)] pub enum ManifestError { @@ -162,6 +163,7 @@ pub fn verify_manifest(document: &ManifestDocument, options: VerifyOptions) -> V } verify_row_identity(document, &options, &mut report); + verify_calibration(document, &options, &mut report); verify_attestations(&document.manifest, &mut report); report.ok = report.errors.is_empty(); @@ -527,6 +529,490 @@ fn verify_row_identity( } } +fn verify_calibration( + document: &ManifestDocument, + options: &VerifyOptions, + report: &mut VerificationReport, +) { + let Some(calibration) = &document.manifest.calibration else { + return; + }; + + report.calibration.present = true; + report.calibration.schema_version = Some(calibration.schema_version.clone()); + report.calibration.profile_id = Some(calibration.profile_id.clone()); + report.calibration.calibrated_for_model = Some(calibration.calibrated_for.model.clone()); + report.calibration.ordinalization = Some(calibration.ordinalization.label().to_string()); + report.calibration.null_model = Some(calibration.null_model.label().to_string()); + + validate_calibration_shape(calibration, report); + validate_calibration_encoder(calibration, &document.manifest.embedding, report); + validate_calibration_ordinalization(calibration, &document.manifest.artifact, report); + validate_calibration_profile( + calibration, + &document.manifest.artifact, + &document.base_dir, + options, + report, + ); +} + +fn validate_calibration_shape( + calibration: &CalibrationProfileRef, + report: &mut VerificationReport, +) { + if calibration.schema_version != CALIBRATION_SCHEMA_VERSION { + report.error( + 
"calibration_schema_version_unsupported", + format!( + "calibration.schema_version must be {CALIBRATION_SCHEMA_VERSION}, got {}", + calibration.schema_version + ), + ); + } + if calibration.profile_id.trim().is_empty() { + report.error( + "calibration_profile_id_empty", + "calibration.profile_id must be non-empty", + ); + } + if calibration + .created_at + .as_ref() + .is_some_and(|created_at| DateTime::parse_from_rfc3339(created_at).is_err()) + { + report.error( + "calibration_created_at_invalid", + "calibration.created_at must parse as RFC3339 when present", + ); + } + if calibration.calibrated_for.model.trim().is_empty() { + report.error( + "calibration_encoder_model_empty", + "calibration.calibrated_for.model must be non-empty", + ); + } + if calibration.calibrated_for.dim == 0 { + report.error( + "calibration_encoder_dim_zero", + "calibration.calibrated_for.dim must be greater than zero", + ); + } + validate_optional_non_empty( + "calibration_encoder_model_revision_empty", + "calibration.calibrated_for.model_revision must be non-empty when present", + calibration.calibrated_for.model_revision.as_deref(), + report, + ); + validate_optional_non_empty( + "calibration_encoder_normalization_empty", + "calibration.calibrated_for.normalization must be non-empty when present", + calibration.calibrated_for.normalization.as_deref(), + report, + ); + if calibration.ordinalization.dim() == 0 { + report.error( + "calibration_ordinalization_dim_zero", + "calibration.ordinalization.dim must be greater than zero", + ); + } + match &calibration.ordinalization { + CalibrationOrdinalization::TopK { k, .. } if *k == 0 => { + report.error( + "calibration_ordinalization_artifact_mismatch", + "calibration top_k.k must be greater than zero", + ); + } + CalibrationOrdinalization::Bucket { bits, .. 
} if !matches!(*bits, 1 | 2 | 4) => { + report.error( + "calibration_ordinalization_artifact_mismatch", + "calibration bucket.bits must be 1, 2, or 4", + ); + } + CalibrationOrdinalization::CallerDefined { name, .. } if name.trim().is_empty() => { + report.error( + "calibration_ordinalization_artifact_mismatch", + "calibration caller_defined.name must be non-empty", + ); + } + _ => {} + } + match &calibration.null_model { + NullModelSpec::EmpiricalTailTable { statistic } if statistic.trim().is_empty() => { + report.error( + "calibration_null_statistic_empty", + "calibration.null_model.statistic must be non-empty", + ); + } + NullModelSpec::CallerDefined { + name, + parameterization, + } => { + if name.trim().is_empty() { + report.error( + "calibration_null_name_empty", + "calibration.null_model.name must be non-empty", + ); + } + validate_optional_non_empty( + "calibration_null_parameterization_empty", + "calibration.null_model.parameterization must be non-empty when present", + parameterization.as_deref(), + report, + ); + } + _ => {} + } +} + +fn validate_calibration_encoder( + calibration: &CalibrationProfileRef, + embedding: &Embedding, + report: &mut VerificationReport, +) { + if calibration.calibrated_for.model != embedding.model { + report.error( + "calibration_encoder_model_mismatch", + format!( + "calibration model {:?} does not match embedding.model {:?}", + calibration.calibrated_for.model, embedding.model + ), + ); + } + if calibration.calibrated_for.dim != embedding.dim { + report.error( + "calibration_encoder_dim_mismatch", + format!( + "calibration dim {} does not match embedding.dim {}", + calibration.calibrated_for.dim, embedding.dim + ), + ); + } + compare_optional_identity( + "calibration_encoder_model_revision_mismatch", + "model_revision", + embedding.model_revision.as_deref(), + calibration.calibrated_for.model_revision.as_deref(), + report, + ); + compare_optional_identity( + "calibration_encoder_normalization_mismatch", + "normalization", + 
embedding.normalization.as_deref(), + calibration.calibrated_for.normalization.as_deref(), + report, + ); +} + +fn compare_optional_identity( + code: &str, + field: &str, + embedding_value: Option<&str>, + calibration_value: Option<&str>, + report: &mut VerificationReport, +) { + match (embedding_value, calibration_value) { + (Some(expected), Some(observed)) if expected == observed => {} + (None, None) => {} + _ => report.error( + code, + format!("calibration encoder {field} does not match embedding.{field}"), + ), + } +} + +fn validate_calibration_ordinalization( + calibration: &CalibrationProfileRef, + artifact: &Artifact, + report: &mut VerificationReport, +) { + if calibration.ordinalization.dim() != artifact.dim { + report.error( + "calibration_ordinalization_dim_mismatch", + format!( + "calibration ordinalization dim {} does not match artifact.dim {}", + calibration.ordinalization.dim(), + artifact.dim + ), + ); + } + + let compatible = match (artifact.kind, &artifact.params, &calibration.ordinalization) { + ( + ManifestIndexKind::Bitmap, + ManifestIndexParams::Bitmap { n_top }, + CalibrationOrdinalization::TopK { k, .. }, + ) => k == n_top, + ( + ManifestIndexKind::RankQuant, + ManifestIndexParams::RankQuant { bits }, + CalibrationOrdinalization::Bucket { + bits: calibrated_bits, + .. + }, + ) => calibrated_bits == bits, + ( + ManifestIndexKind::SignBitmap, + ManifestIndexParams::SignBitmap, + CalibrationOrdinalization::Sign { .. }, + ) => true, + ( + ManifestIndexKind::Rank, + ManifestIndexParams::Rank, + CalibrationOrdinalization::RankPosition { .. } + | CalibrationOrdinalization::CallerDefined { .. 
}, + ) => true, + _ => false, + }; + + if !compatible { + report.error( + "calibration_ordinalization_artifact_mismatch", + "calibration.ordinalization is incompatible with artifact.kind/artifact.params", + ); + } +} + +fn validate_calibration_profile( + calibration: &CalibrationProfileRef, + artifact: &Artifact, + base_dir: &Path, + options: &VerifyOptions, + report: &mut VerificationReport, +) { + if matches!( + &calibration.null_model, + NullModelSpec::UniformHypergeometric + ) { + if calibration.profile.is_some() { + report.error( + "calibration_profile_unexpected", + "uniform_hypergeometric calibration must not include a profile artifact", + ); + } + return; + } + + let Some(profile) = &calibration.profile else { + report.error( + "calibration_profile_required", + "non-uniform calibration requires a profile artifact", + ); + return; + }; + + report.calibration.profile_manifest_path = Some(profile.path.clone()); + if profile.path.trim().is_empty() { + report.error( + "calibration_profile_path_empty", + "calibration.profile.path must be non-empty", + ); + } + if !is_sha256_hex(&profile.sha256) { + report.error( + "calibration_profile_sha256_invalid", + "calibration.profile.sha256 must be a lowercase 64-character hex SHA-256 digest", + ); + } + if profile.file_size_bytes == 0 { + report.error( + "calibration_profile_file_size_zero", + "calibration.profile.file_size_bytes must be greater than zero", + ); + } + if profile.dim != artifact.dim { + report.error( + "calibration_profile_dim_mismatch", + format!( + "calibration profile dim {} does not match artifact.dim {}", + profile.dim, artifact.dim + ), + ); + } + if profile.sample_count == 0 { + report.error( + "calibration_profile_sample_count_zero", + "calibration.profile.sample_count must be greater than zero", + ); + } + validate_optional_source_digest(profile.source_digest.as_deref(), report); + validate_calibration_parameterization(calibration, profile, report); + validate_calibration_profile_shape(profile, 
&calibration.ordinalization, report); + + if !profile.path.trim().is_empty() { + let path = PathBuf::from(&profile.path); + if let Some(resolved) = resolve_existing_path( + &path, + base_dir, + options, + "calibration_profile", + &mut report.errors, + ) { + report.calibration.profile_canonical_path = + Some(path_to_display(&resolved.canonical_path)); + match sha256_file(&resolved.resolved_path) { + Ok(hash) => { + report.calibration.profile_sha256 = Some(hash.sha256.clone()); + report.calibration.profile_size_bytes = Some(hash.size_bytes); + if !hex_digest_eq(&hash.sha256, &profile.sha256) { + report.error( + "calibration_profile_sha256_mismatch", + format!( + "calibration profile SHA-256 was {}, manifest declares {}", + hash.sha256, profile.sha256 + ), + ); + } + if hash.size_bytes != profile.file_size_bytes { + report.error( + "calibration_profile_file_size_mismatch", + format!( + "calibration profile size was {}, manifest declares {}", + hash.size_bytes, profile.file_size_bytes + ), + ); + } + } + Err(err) => report.error( + "calibration_profile_hash_failed", + format!("failed to hash calibration profile: {err}"), + ), + } + } + } +} + +fn validate_optional_source_digest(value: Option<&str>, report: &mut VerificationReport) { + let Some(value) = value else { + return; + }; + let Some(digest) = value.strip_prefix("sha256:") else { + report.error( + "calibration_profile_source_digest_invalid", + "calibration.profile.source_digest must be sha256:", + ); + return; + }; + if !is_sha256_hex(digest) { + report.error( + "calibration_profile_source_digest_invalid", + "calibration.profile.source_digest must be sha256:", + ); + } +} + +fn validate_calibration_parameterization( + calibration: &CalibrationProfileRef, + profile: &ProfileArtifactRef, + report: &mut VerificationReport, +) { + match &calibration.null_model { + NullModelSpec::WeightedMarginalProfile { parameterization } + if *parameterization != profile.parameterization => + { + report.error( + 
"calibration_null_parameterization_mismatch", + format!( + "null_model parameterization {:?} does not match profile parameterization {:?}", + parameterization, profile.parameterization + ), + ); + } + NullModelSpec::EmpiricalTailTable { .. } + if profile.parameterization != ProfileParameterization::EmpiricalTailTable => + { + report.error( + "calibration_null_parameterization_mismatch", + "empirical_tail_table null_model requires empirical_tail_table profile parameterization", + ); + } + _ => {} + } +} + +fn validate_calibration_profile_shape( + profile: &ProfileArtifactRef, + ordinalization: &CalibrationOrdinalization, + report: &mut VerificationReport, +) { + if profile.format.trim().is_empty() { + report.error( + "calibration_profile_format_empty", + "calibration.profile.format must be non-empty", + ); + } + + if profile.shape.is_empty() { + return; + } + + if let Some(expected) = expected_profile_shape(profile.parameterization, ordinalization) { + if profile.shape != expected { + report.error( + "calibration_profile_shape_mismatch", + format!( + "calibration profile shape {:?} does not match expected {:?}", + profile.shape, expected + ), + ); + } + } + + let bytes_per_value = match profile.format.as_str() { + "raw_f64_le" => Some(8u64), + "raw_f32_le" => Some(4u64), + _ => None, + }; + let Some(bytes_per_value) = bytes_per_value else { + return; + }; + let Some(values) = profile + .shape + .iter() + .try_fold(1u64, |acc, value| acc.checked_mul(*value as u64)) + else { + report.error( + "calibration_profile_shape_mismatch", + "calibration.profile.shape product overflows u64", + ); + return; + }; + let Some(expected_bytes) = values.checked_mul(bytes_per_value) else { + report.error( + "calibration_profile_shape_mismatch", + "calibration.profile.shape byte size overflows u64", + ); + return; + }; + if profile.file_size_bytes != expected_bytes { + report.error( + "calibration_profile_file_size_mismatch", + format!( + "calibration profile size {} does not match 
shape/format size {}", + profile.file_size_bytes, expected_bytes + ), + ); + } +} + +fn expected_profile_shape( + parameterization: ProfileParameterization, + ordinalization: &CalibrationOrdinalization, +) -> Option> { + match parameterization { + ProfileParameterization::MarginalTopKFrequency => Some(vec![ordinalization.dim()]), + ProfileParameterization::SignFrequency => Some(vec![ordinalization.dim()]), + ProfileParameterization::BucketFrequency => match ordinalization { + CalibrationOrdinalization::Bucket { dim, bits } => Some(vec![*dim, 1usize << *bits]), + _ => None, + }, + ProfileParameterization::RankPositionFrequency => { + Some(vec![ordinalization.dim(), ordinalization.dim()]) + } + ProfileParameterization::EmpiricalTailTable => None, + } +} + fn verify_attestations(manifest: &IndexManifest, report: &mut VerificationReport) { if manifest.attestations.is_empty() { report @@ -698,6 +1184,8 @@ pub struct IndexManifest { pub created_at: String, pub artifact: Artifact, pub embedding: Embedding, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub calibration: Option, pub row_identity: RowIdentity, #[serde(default, skip_serializing_if = "Option::is_none")] pub build: Option, @@ -736,6 +1224,118 @@ pub struct Embedding { pub normalization: Option, } +#[derive(Clone, Debug, Serialize, Deserialize)] +#[serde(deny_unknown_fields)] +pub struct CalibrationProfileRef { + pub schema_version: String, + pub profile_id: String, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub created_at: Option, + pub calibrated_for: EncoderSpec, + pub ordinalization: CalibrationOrdinalization, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub profile: Option, + pub null_model: NullModelSpec, +} + +#[derive(Clone, Debug, Serialize, Deserialize)] +#[serde(deny_unknown_fields)] +pub struct EncoderSpec { + pub model: String, + pub dim: usize, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub model_revision: Option, + 
#[serde(default, skip_serializing_if = "Option::is_none")] + pub normalization: Option, +} + +#[derive(Clone, Debug, Serialize, Deserialize)] +#[serde(tag = "kind", rename_all = "snake_case", deny_unknown_fields)] +pub enum CalibrationOrdinalization { + TopK { dim: usize, k: usize }, + Bucket { dim: usize, bits: u8 }, + Sign { dim: usize }, + RankPosition { dim: usize }, + CallerDefined { dim: usize, name: String }, +} + +impl CalibrationOrdinalization { + pub fn dim(&self) -> usize { + match self { + Self::TopK { dim, .. } + | Self::Bucket { dim, .. } + | Self::Sign { dim } + | Self::RankPosition { dim } + | Self::CallerDefined { dim, .. } => *dim, + } + } + + pub fn label(&self) -> &'static str { + match self { + Self::TopK { .. } => "top_k", + Self::Bucket { .. } => "bucket", + Self::Sign { .. } => "sign", + Self::RankPosition { .. } => "rank_position", + Self::CallerDefined { .. } => "caller_defined", + } + } +} + +#[derive(Clone, Debug, Serialize, Deserialize)] +#[serde(deny_unknown_fields)] +pub struct ProfileArtifactRef { + pub path: String, + pub sha256: String, + pub file_size_bytes: u64, + pub dim: usize, + pub sample_count: usize, + pub parameterization: ProfileParameterization, + pub format: String, + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub shape: Vec, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub source_digest: Option, +} + +#[derive(Copy, Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum ProfileParameterization { + #[serde(rename = "marginal_topk_frequency")] + MarginalTopKFrequency, + BucketFrequency, + SignFrequency, + RankPositionFrequency, + EmpiricalTailTable, +} + +#[derive(Clone, Debug, Serialize, Deserialize)] +#[serde(tag = "kind", rename_all = "snake_case", deny_unknown_fields)] +pub enum NullModelSpec { + UniformHypergeometric, + WeightedMarginalProfile { + parameterization: ProfileParameterization, + }, + EmpiricalTailTable { + statistic: String, 
+ }, + CallerDefined { + name: String, + #[serde(default, skip_serializing_if = "Option::is_none")] + parameterization: Option, + }, +} + +impl NullModelSpec { + pub fn label(&self) -> &'static str { + match self { + Self::UniformHypergeometric => "uniform_hypergeometric", + Self::WeightedMarginalProfile { .. } => "weighted_marginal_profile", + Self::EmpiricalTailTable { .. } => "empirical_tail_table", + Self::CallerDefined { .. } => "caller_defined", + } + } +} + #[derive(Clone, Debug, Serialize, Deserialize)] #[serde(deny_unknown_fields)] pub struct BuildInfo { @@ -834,6 +1434,7 @@ pub struct VerificationReport { pub manifest_id: Option, pub artifact: ArtifactReport, pub row_identity: RowIdentityReport, + pub calibration: CalibrationReport, pub attestation_shape_checks: Vec, pub errors: Vec, pub warnings: Vec, @@ -848,6 +1449,7 @@ impl VerificationReport { manifest_id, artifact: ArtifactReport::default(), row_identity: RowIdentityReport::default(), + calibration: CalibrationReport::default(), attestation_shape_checks: Vec::new(), errors: Vec::new(), warnings: Vec::new(), @@ -880,6 +1482,20 @@ pub struct RowIdentityReport { pub validated_rows: Option, } +#[derive(Clone, Debug, Default, Serialize, Deserialize)] +pub struct CalibrationReport { + pub present: bool, + pub schema_version: Option, + pub profile_id: Option, + pub calibrated_for_model: Option, + pub ordinalization: Option, + pub null_model: Option, + pub profile_manifest_path: Option, + pub profile_canonical_path: Option, + pub profile_sha256: Option, + pub profile_size_bytes: Option, +} + #[derive(Clone, Debug, Serialize, Deserialize)] pub struct MetadataReport { pub kind: ManifestIndexKind, @@ -1057,6 +1673,7 @@ pub fn create_manifest_for_index_with_options( embedding_matrix_digest: None, normalization: None, }, + calibration: None, row_identity, build: Some(BuildInfo { invocation_id, diff --git a/ordvec-manifest/src/main.rs b/ordvec-manifest/src/main.rs index 5d1b412..a2c76c9 100644 --- 
a/ordvec-manifest/src/main.rs +++ b/ordvec-manifest/src/main.rs @@ -2,7 +2,7 @@ use clap::{Parser, Subcommand}; use ordvec_manifest::{ create_manifest_for_index_with_options, load_manifest_file, sha256_file, verify_manifest, write_manifest_file, CreateManifestOptions, CreateRowIdentity, ManifestDocument, ManifestError, - VerifyOptions, + NullModelSpec, ProfileParameterization, VerifyOptions, }; use serde_json::json; use std::fs; @@ -150,6 +150,7 @@ fn run() -> Result { println!("schema_version: {}", document.manifest.schema_version); println!("artifact: {}", document.manifest.artifact.path); println!("row_identity: {}", row_identity_label(&document)); + println!("calibration: {}", calibration_label(&document)); } Ok(0) } @@ -321,8 +322,35 @@ fn print_json(value: &impl serde::Serialize) -> Result<(), ManifestError> { } fn row_identity_label(document: &ManifestDocument) -> &'static str { - match document.manifest.row_identity { + match &document.manifest.row_identity { ordvec_manifest::RowIdentity::RowIdIdentity { .. } => "row_id_identity", ordvec_manifest::RowIdentity::Jsonl { .. } => "jsonl", } } + +fn calibration_label(document: &ManifestDocument) -> String { + let Some(calibration) = &document.manifest.calibration else { + return "absent".to_string(); + }; + match &calibration.null_model { + NullModelSpec::UniformHypergeometric => "uniform_hypergeometric".to_string(), + NullModelSpec::WeightedMarginalProfile { parameterization } => { + format!( + "weighted_marginal_profile / {}", + profile_parameterization_label(*parameterization) + ) + } + NullModelSpec::EmpiricalTailTable { .. } => "empirical_tail_table".to_string(), + NullModelSpec::CallerDefined { name, .. 
} => format!("caller_defined / {name}"), + } +} + +fn profile_parameterization_label(parameterization: ProfileParameterization) -> &'static str { + match parameterization { + ProfileParameterization::MarginalTopKFrequency => "marginal_topk_frequency", + ProfileParameterization::BucketFrequency => "bucket_frequency", + ProfileParameterization::SignFrequency => "sign_frequency", + ProfileParameterization::RankPositionFrequency => "rank_position_frequency", + ProfileParameterization::EmpiricalTailTable => "empirical_tail_table", + } +} diff --git a/ordvec-manifest/src/sqlite.rs b/ordvec-manifest/src/sqlite.rs index 3b14a12..8e37dff 100644 --- a/ordvec-manifest/src/sqlite.rs +++ b/ordvec-manifest/src/sqlite.rs @@ -107,6 +107,7 @@ fn init(conn: &Connection) -> Result<(), ManifestError> { options_sha256 TEXT, artifact_sha256 TEXT, row_identity_sha256 TEXT, + calibration_profile_sha256 TEXT, report_json TEXT NOT NULL ); INSERT INTO verification_reports( @@ -129,6 +130,7 @@ fn init(conn: &Connection) -> Result<(), ManifestError> { options_sha256 TEXT, artifact_sha256 TEXT, row_identity_sha256 TEXT, + calibration_profile_sha256 TEXT, report_json TEXT NOT NULL ); CREATE INDEX IF NOT EXISTS verification_reports_cache_idx @@ -138,6 +140,7 @@ fn init(conn: &Connection) -> Result<(), ManifestError> { options_sha256, artifact_sha256, row_identity_sha256, + calibration_profile_sha256, report_id ); CREATE TABLE IF NOT EXISTS active_manifest( @@ -171,8 +174,9 @@ fn store_report( options_sha256, artifact_sha256, row_identity_sha256, + calibration_profile_sha256, report_json - ) VALUES(?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9)", + ) VALUES(?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9, ?10)", params![ document.manifest.manifest_id, manifest_path.display().to_string(), @@ -182,6 +186,7 @@ fn store_report( cache_key.map(|key| key.options_sha256.as_str()), cache_key.map(|key| key.artifact_sha256.as_str()), cache_key.and_then(|key| key.row_identity_sha256.as_deref()), + cache_key.and_then(|key| 
key.calibration_profile_sha256.as_deref()), report_json, ], ) @@ -207,6 +212,10 @@ fn load_cached_report( (row_identity_sha256 IS NULL AND ?5 IS NULL) OR row_identity_sha256 = ?5 ) + AND ( + (calibration_profile_sha256 IS NULL AND ?6 IS NULL) + OR calibration_profile_sha256 = ?6 + ) ORDER BY report_id DESC LIMIT 1", params![ @@ -215,6 +224,7 @@ fn load_cached_report( cache_key.options_sha256.as_str(), cache_key.artifact_sha256.as_str(), cache_key.row_identity_sha256.as_deref(), + cache_key.calibration_profile_sha256.as_deref(), ], |row| row.get(0), ) @@ -231,6 +241,7 @@ struct CacheKey { options_sha256: String, artifact_sha256: String, row_identity_sha256: Option, + calibration_profile_sha256: Option, } #[derive(Serialize)] @@ -306,12 +317,14 @@ fn current_cache_key( } } }; + let calibration_profile_sha256 = current_calibration_profile_sha256(document, options)?; Ok(Some(CacheKey { manifest_sha256, options_sha256, artifact_sha256, row_identity_sha256, + calibration_profile_sha256, })) } @@ -339,14 +352,58 @@ fn cache_key_from_report( Some(sha256) } }; + let calibration_profile_sha256 = if document + .manifest + .calibration + .as_ref() + .and_then(|calibration| calibration.profile.as_ref()) + .is_some() + { + let Some(sha256) = report.calibration.profile_sha256.clone() else { + return Ok(None); + }; + Some(sha256) + } else { + None + }; Ok(Some(CacheKey { manifest_sha256, options_sha256, artifact_sha256, row_identity_sha256, + calibration_profile_sha256, })) } +fn current_calibration_profile_sha256( + document: &ManifestDocument, + options: &VerifyOptions, +) -> Result, ManifestError> { + let Some(profile) = document + .manifest + .calibration + .as_ref() + .and_then(|calibration| calibration.profile.as_ref()) + else { + return Ok(None); + }; + let path = PathBuf::from(&profile.path); + let mut path_errors = Vec::::new(); + let Some(resolved) = resolve_existing_path( + &path, + &document.base_dir, + options, + "calibration_profile", + &mut path_errors, + ) else { + 
return Ok(None); + }; + match sha256_file(&resolved.resolved_path) { + Ok(hash) => Ok(Some(hash.sha256)), + Err(_) => Ok(None), + } +} + fn sha256_bytes(bytes: &[u8]) -> String { let mut hasher = Sha256::new(); hasher.update(bytes); @@ -377,7 +434,10 @@ fn verification_reports_needs_migration(conn: &Connection) -> Result, _>>() .map_err(sqlite_err)?; Ok(!columns.iter().any(|column| column == "report_id") - || !columns.iter().any(|column| column == "manifest_sha256")) + || !columns.iter().any(|column| column == "manifest_sha256") + || !columns + .iter() + .any(|column| column == "calibration_profile_sha256")) } fn sqlite_err(err: rusqlite::Error) -> ManifestError { diff --git a/ordvec-manifest/tests/manifest.rs b/ordvec-manifest/tests/manifest.rs index 8722cc2..22a8e3c 100644 --- a/ordvec-manifest/tests/manifest.rs +++ b/ordvec-manifest/tests/manifest.rs @@ -1,8 +1,10 @@ use ordvec::{Bitmap, Rank, RankQuant, SignBitmap}; use ordvec_manifest::{ create_manifest_for_index, create_manifest_for_index_with_options, load_manifest_file, - sha256_file, verify_index_manifest, verify_manifest_with_base, CreateManifestOptions, - CreateRowIdentity, ManifestIndexParams, RowIdentity, VerifyOptions, + sha256_file, verify_index_manifest, verify_manifest_with_base, CalibrationOrdinalization, + CalibrationProfileRef, CreateManifestOptions, CreateRowIdentity, EncoderSpec, + ManifestIndexParams, NullModelSpec, ProfileArtifactRef, ProfileParameterization, RowIdentity, + VerifyOptions, CALIBRATION_SCHEMA_VERSION, }; use serde_json::json; use std::fs; @@ -86,6 +88,73 @@ fn identity_manifest(dir: &Path) -> (tempfile::TempDir, ordvec_manifest::IndexMa (temp, manifest, manifest_path) } +fn write_profile(path: &Path, size_bytes: usize) -> ordvec_manifest::FileHash { + fs::write(path, vec![0u8; size_bytes]).unwrap(); + sha256_file(path).unwrap() +} + +fn uniform_calibration( + manifest: &ordvec_manifest::IndexManifest, + ordinalization: CalibrationOrdinalization, +) -> CalibrationProfileRef { + 
CalibrationProfileRef { + schema_version: CALIBRATION_SCHEMA_VERSION.to_string(), + profile_id: "urn:uuid:7c66ad6e-bdde-49a8-b420-f1136d04f5bd".to_string(), + created_at: Some("2026-05-29T06:00:00Z".to_string()), + calibrated_for: EncoderSpec { + model: manifest.embedding.model.clone(), + dim: manifest.embedding.dim, + model_revision: manifest.embedding.model_revision.clone(), + normalization: manifest.embedding.normalization.clone(), + }, + ordinalization, + profile: None, + null_model: NullModelSpec::UniformHypergeometric, + } +} + +fn weighted_calibration( + manifest: &ordvec_manifest::IndexManifest, + path: impl Into, + hash: ordvec_manifest::FileHash, + ordinalization: CalibrationOrdinalization, + parameterization: ProfileParameterization, + shape: Vec, +) -> CalibrationProfileRef { + CalibrationProfileRef { + schema_version: CALIBRATION_SCHEMA_VERSION.to_string(), + profile_id: "urn:uuid:7c66ad6e-bdde-49a8-b420-f1136d04f5bd".to_string(), + created_at: Some("2026-05-29T06:00:00Z".to_string()), + calibrated_for: EncoderSpec { + model: manifest.embedding.model.clone(), + dim: manifest.embedding.dim, + model_revision: manifest.embedding.model_revision.clone(), + normalization: manifest.embedding.normalization.clone(), + }, + ordinalization, + profile: Some(ProfileArtifactRef { + path: path.into(), + sha256: hash.sha256, + file_size_bytes: hash.size_bytes, + dim: manifest.artifact.dim, + sample_count: 100, + parameterization, + format: "raw_f64_le".to_string(), + shape, + source_digest: None, + }), + null_model: NullModelSpec::WeightedMarginalProfile { parameterization }, + } +} + +fn error_codes(report: &ordvec_manifest::VerificationReport) -> Vec<&str> { + report + .errors + .iter() + .map(|issue| issue.code.as_str()) + .collect() +} + #[test] fn create_then_verify_identity_manifest_for_all_persisted_formats() { let temp = tempfile::tempdir().unwrap(); @@ -214,6 +283,320 @@ fn schema_enforces_lowercase_sha256_and_optional_field_shapes() { } } +#[test] +fn 
calibration_schema_shape_is_strict_and_optional() { + let root = tempfile::tempdir().unwrap(); + let (temp, manifest, _manifest_path) = identity_manifest(root.path()); + let report = verify_manifest_with_base(manifest.clone(), temp.path(), VerifyOptions::default()); + assert!(report.ok, "{:?}", report.errors); + assert!(!report.calibration.present); + + let mut with_unknown = manifest.clone(); + with_unknown.calibration = Some(uniform_calibration( + &with_unknown, + CalibrationOrdinalization::Bucket { + dim: with_unknown.artifact.dim, + bits: 2, + }, + )); + let mut value = serde_json::to_value(&with_unknown).unwrap(); + value["calibration"] + .as_object_mut() + .unwrap() + .insert("unknown".to_string(), json!(true)); + let parsed = serde_json::from_value::(value); + assert!(parsed.is_err(), "calibration must reject unknown fields"); + + let mut bad = manifest; + let mut calibration = uniform_calibration( + &bad, + CalibrationOrdinalization::Bucket { + dim: bad.artifact.dim, + bits: 2, + }, + ); + calibration.schema_version = "ordvec.calibration.v2".to_string(); + calibration.created_at = Some("not-rfc3339".to_string()); + bad.calibration = Some(calibration); + let report = verify_manifest_with_base(bad, temp.path(), VerifyOptions::default()); + for code in [ + "calibration_schema_version_unsupported", + "calibration_created_at_invalid", + ] { + assert!( + error_codes(&report).contains(&code), + "missing {code}: {:?}", + report.errors + ); + } +} + +#[test] +fn calibration_encoder_identity_must_match_embedding() { + let root = tempfile::tempdir().unwrap(); + let (temp, mut manifest, _manifest_path) = identity_manifest(root.path()); + manifest.embedding.model_revision = Some("rev-a".to_string()); + manifest.embedding.normalization = Some("l2".to_string()); + let mut calibration = uniform_calibration( + &manifest, + CalibrationOrdinalization::Bucket { + dim: manifest.artifact.dim, + bits: 2, + }, + ); + calibration.calibrated_for.model = "other-model".to_string(); + 
calibration.calibrated_for.dim += 1; + calibration.calibrated_for.model_revision = Some("rev-b".to_string()); + calibration.calibrated_for.normalization = Some("as_provided".to_string()); + manifest.calibration = Some(calibration); + + let report = verify_manifest_with_base(manifest, temp.path(), VerifyOptions::default()); + for code in [ + "calibration_encoder_model_mismatch", + "calibration_encoder_dim_mismatch", + "calibration_encoder_model_revision_mismatch", + "calibration_encoder_normalization_mismatch", + ] { + assert!( + error_codes(&report).contains(&code), + "missing {code}: {:?}", + report.errors + ); + } +} + +#[test] +fn calibration_ordinalization_matches_artifact_formats() { + let temp = tempfile::tempdir().unwrap(); + + let bitmap_case = tempfile::tempdir_in(temp.path()).unwrap(); + let bitmap = write_index_kind(bitmap_case.path(), FixtureKind::Bitmap); + let bitmap_manifest_path = bitmap_case.path().join("manifest.json"); + let mut bitmap_manifest = create_manifest_for_index( + &bitmap, + CreateRowIdentity::RowIdIdentity, + "test-embedding", + &bitmap_manifest_path, + ) + .unwrap(); + bitmap_manifest.calibration = Some(uniform_calibration( + &bitmap_manifest, + CalibrationOrdinalization::TopK { + dim: bitmap_manifest.artifact.dim, + k: 16, + }, + )); + let report = verify_manifest_with_base( + bitmap_manifest.clone(), + bitmap_case.path(), + VerifyOptions::default(), + ); + assert!(report.ok, "{:?}", report.errors); + bitmap_manifest.calibration = Some(uniform_calibration( + &bitmap_manifest, + CalibrationOrdinalization::TopK { + dim: bitmap_manifest.artifact.dim, + k: 8, + }, + )); + let report = verify_manifest_with_base( + bitmap_manifest, + bitmap_case.path(), + VerifyOptions::default(), + ); + assert!(error_codes(&report).contains(&"calibration_ordinalization_artifact_mismatch")); + + let rq_case = tempfile::tempdir_in(temp.path()).unwrap(); + let rank_quant = write_index_kind(rq_case.path(), FixtureKind::RankQuant); + let rq_manifest_path = 
rq_case.path().join("manifest.json"); + let mut rq_manifest = create_manifest_for_index( + &rank_quant, + CreateRowIdentity::RowIdIdentity, + "test-embedding", + &rq_manifest_path, + ) + .unwrap(); + rq_manifest.calibration = Some(uniform_calibration( + &rq_manifest, + CalibrationOrdinalization::Bucket { + dim: rq_manifest.artifact.dim, + bits: 2, + }, + )); + let report = verify_manifest_with_base( + rq_manifest.clone(), + rq_case.path(), + VerifyOptions::default(), + ); + assert!(report.ok, "{:?}", report.errors); + rq_manifest.calibration = Some(uniform_calibration( + &rq_manifest, + CalibrationOrdinalization::Bucket { + dim: rq_manifest.artifact.dim, + bits: 4, + }, + )); + let report = verify_manifest_with_base(rq_manifest, rq_case.path(), VerifyOptions::default()); + assert!(error_codes(&report).contains(&"calibration_ordinalization_artifact_mismatch")); + + let sign_case = tempfile::tempdir_in(temp.path()).unwrap(); + let sign = write_index_kind(sign_case.path(), FixtureKind::SignBitmap); + let sign_manifest_path = sign_case.path().join("manifest.json"); + let mut sign_manifest = create_manifest_for_index( + &sign, + CreateRowIdentity::RowIdIdentity, + "test-embedding", + &sign_manifest_path, + ) + .unwrap(); + sign_manifest.calibration = Some(uniform_calibration( + &sign_manifest, + CalibrationOrdinalization::Sign { + dim: sign_manifest.artifact.dim, + }, + )); + let report = + verify_manifest_with_base(sign_manifest, sign_case.path(), VerifyOptions::default()); + assert!(report.ok, "{:?}", report.errors); + + let rank_case = tempfile::tempdir_in(temp.path()).unwrap(); + let rank = write_index_kind(rank_case.path(), FixtureKind::Rank); + let rank_manifest_path = rank_case.path().join("manifest.json"); + let mut rank_manifest = create_manifest_for_index( + &rank, + CreateRowIdentity::RowIdIdentity, + "test-embedding", + &rank_manifest_path, + ) + .unwrap(); + rank_manifest.calibration = Some(uniform_calibration( + &rank_manifest, + 
CalibrationOrdinalization::RankPosition { + dim: rank_manifest.artifact.dim, + }, + )); + let report = + verify_manifest_with_base(rank_manifest, rank_case.path(), VerifyOptions::default()); + assert!(report.ok, "{:?}", report.errors); +} + +#[test] +fn calibration_profile_artifact_checks_are_enforced() { + let temp = tempfile::tempdir().unwrap(); + let case = tempfile::tempdir_in(temp.path()).unwrap(); + let profile_dir = case.path().join("profiles"); + fs::create_dir(&profile_dir).unwrap(); + let index = write_index_kind(case.path(), FixtureKind::Bitmap); + let manifest_path = case.path().join("manifest.json"); + let mut manifest = create_manifest_for_index( + &index, + CreateRowIdentity::RowIdIdentity, + "test-embedding", + &manifest_path, + ) + .unwrap(); + let profile_hash = write_profile( + &profile_dir.join("profile.f64"), + manifest.artifact.dim * std::mem::size_of::(), + ); + manifest.calibration = Some(weighted_calibration( + &manifest, + "profiles/profile.f64", + profile_hash.clone(), + CalibrationOrdinalization::TopK { + dim: manifest.artifact.dim, + k: 16, + }, + ProfileParameterization::MarginalTopKFrequency, + vec![manifest.artifact.dim], + )); + let report = verify_manifest_with_base(manifest.clone(), case.path(), VerifyOptions::default()); + assert!(report.ok, "{:?}", report.errors); + assert!(report.calibration.present); + assert_eq!( + report.calibration.profile_sha256.as_deref(), + Some(profile_hash.sha256.as_str()) + ); + + let mut missing_profile = manifest.clone(); + missing_profile.calibration.as_mut().unwrap().profile = None; + let report = verify_manifest_with_base(missing_profile, case.path(), VerifyOptions::default()); + assert!(error_codes(&report).contains(&"calibration_profile_required")); + + let mut unexpected_profile = manifest.clone(); + unexpected_profile.calibration.as_mut().unwrap().null_model = + NullModelSpec::UniformHypergeometric; + let report = + verify_manifest_with_base(unexpected_profile, case.path(), 
VerifyOptions::default()); + assert!(error_codes(&report).contains(&"calibration_profile_unexpected")); + + let mut hash_mismatch = manifest.clone(); + hash_mismatch + .calibration + .as_mut() + .unwrap() + .profile + .as_mut() + .unwrap() + .sha256 = "b".repeat(64); + let report = verify_manifest_with_base(hash_mismatch, case.path(), VerifyOptions::default()); + assert!(error_codes(&report).contains(&"calibration_profile_sha256_mismatch")); + + let mut size_mismatch = manifest.clone(); + size_mismatch + .calibration + .as_mut() + .unwrap() + .profile + .as_mut() + .unwrap() + .file_size_bytes += 8; + let report = verify_manifest_with_base(size_mismatch, case.path(), VerifyOptions::default()); + assert!(error_codes(&report).contains(&"calibration_profile_file_size_mismatch")); + + let mut zero_sample = manifest.clone(); + zero_sample + .calibration + .as_mut() + .unwrap() + .profile + .as_mut() + .unwrap() + .sample_count = 0; + let report = verify_manifest_with_base(zero_sample, case.path(), VerifyOptions::default()); + assert!(error_codes(&report).contains(&"calibration_profile_sample_count_zero")); + + let outside = temp.path().join("outside-profile.f64"); + let outside_hash = write_profile(&outside, manifest.artifact.dim * std::mem::size_of::()); + let mut escaped = manifest.clone(); + let escaped_profile = escaped + .calibration + .as_mut() + .unwrap() + .profile + .as_mut() + .unwrap(); + escaped_profile.path = "../outside-profile.f64".to_string(); + escaped_profile.sha256 = outside_hash.sha256.clone(); + escaped_profile.file_size_bytes = outside_hash.size_bytes; + let report = verify_manifest_with_base(escaped, case.path(), VerifyOptions::default()); + assert!(error_codes(&report).contains(&"calibration_profile_path_escape_rejected")); + + let mut absolute = manifest; + let absolute_profile = absolute + .calibration + .as_mut() + .unwrap() + .profile + .as_mut() + .unwrap(); + absolute_profile.path = outside.display().to_string(); + absolute_profile.sha256 = 
outside_hash.sha256; + absolute_profile.file_size_bytes = outside_hash.size_bytes; + let report = verify_manifest_with_base(absolute, case.path(), VerifyOptions::default()); + assert!(error_codes(&report).contains(&"calibration_profile_absolute_path_rejected")); +} + #[test] fn artifact_metadata_mismatches_are_reported_with_stable_codes() { let root = tempfile::tempdir().unwrap(); @@ -779,3 +1162,69 @@ fn sqlite_cache_is_explicit_and_activation_reverifies_by_default() { .iter() .any(|issue| issue.code == "sqlite_activation_forced")); } + +#[cfg(feature = "sqlite")] +#[test] +fn sqlite_cache_key_includes_calibration_profile_bytes() { + let temp = tempfile::tempdir().unwrap(); + let profile_dir = temp.path().join("profiles"); + fs::create_dir(&profile_dir).unwrap(); + let index = write_index_kind(temp.path(), FixtureKind::Bitmap); + let manifest_path = temp.path().join("manifest.json"); + let mut manifest = create_manifest_for_index( + &index, + CreateRowIdentity::RowIdIdentity, + "test-embedding", + &manifest_path, + ) + .unwrap(); + let profile_path = profile_dir.join("profile.f64"); + let profile_hash = write_profile( + &profile_path, + manifest.artifact.dim * std::mem::size_of::(), + ); + manifest.calibration = Some(weighted_calibration( + &manifest, + "profiles/profile.f64", + profile_hash, + CalibrationOrdinalization::TopK { + dim: manifest.artifact.dim, + k: 16, + }, + ProfileParameterization::MarginalTopKFrequency, + vec![manifest.artifact.dim], + )); + fs::write( + &manifest_path, + serde_json::to_string_pretty(&manifest).unwrap(), + ) + .unwrap(); + let document = load_manifest_file(&manifest_path).unwrap(); + let db = temp.path().join("registry.sqlite"); + + let report = ordvec_manifest::sqlite::verify_with_registry( + &db, + &document, + &manifest_path, + VerifyOptions::default(), + true, + ) + .unwrap(); + assert!(report.ok, "{:?}", report.errors); + + fs::write( + &profile_path, + vec![1u8; manifest.artifact.dim * std::mem::size_of::()], + ) + 
.unwrap(); + let cached = ordvec_manifest::sqlite::verify_with_registry( + &db, + &document, + &manifest_path, + VerifyOptions::default(), + true, + ) + .unwrap(); + assert!(!cached.ok, "profile drift must force fresh verification"); + assert!(error_codes(&cached).contains(&"calibration_profile_sha256_mismatch")); +} From 18cb410aad0c5ff44476e359c9aba90143c92b7f Mon Sep 17 00:00:00 2001 From: Nelson Spence Date: Fri, 29 May 2026 10:16:25 -0500 Subject: [PATCH 5/6] avoid release invariant grep sigpipe Signed-off-by: Nelson Spence --- tests/release_signed_release_invariants.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/release_signed_release_invariants.sh b/tests/release_signed_release_invariants.sh index 2aa6542..99542e0 100755 --- a/tests/release_signed_release_invariants.sh +++ b/tests/release_signed_release_invariants.sh @@ -47,7 +47,9 @@ job_body() { # Accept both `needs: [a, b, c]` (inline) and `needs:\n - a\n - b` (block) forms. job_needs() { local jobname="$1" needed="$2" - job_body "$jobname" | grep -qE "(^[[:space:]]+needs:.*\\b${needed}\\b|^[[:space:]]+-[[:space:]]+${needed}[[:space:]]*$)" + local body + body="$(job_body "$jobname")" + printf '%s\n' "$body" | grep -qE "(^[[:space:]]+needs:.*\\b${needed}\\b|^[[:space:]]+-[[:space:]]+${needed}[[:space:]]*$)" } # ---------------------------------------------------------------------- From a2249556633a1e5ca0d9e2f3b877e331370264d9 Mon Sep 17 00:00:00 2001 From: Nelson Spence Date: Fri, 29 May 2026 10:31:43 -0500 Subject: [PATCH 6/6] tighten calibration null semantics Signed-off-by: Nelson Spence --- docs/INDEX_PROVENANCE.md | 4 +- ordvec-manifest/src/lib.rs | 61 +++++++++++++++++ ordvec-manifest/tests/manifest.rs | 105 ++++++++++++++++++++++++++++-- 3 files changed, 165 insertions(+), 5 deletions(-) diff --git a/docs/INDEX_PROVENANCE.md b/docs/INDEX_PROVENANCE.md index ec4af7a..be3ab2b 100644 --- a/docs/INDEX_PROVENANCE.md +++ b/docs/INDEX_PROVENANCE.md @@ -76,7 +76,9 @@ null. 
The verifier checks profile identity, path/hash integrity, encoder identity, and ordinalization compatibility; it does not judge whether the null model is scientifically adequate and does not compute likelihood ratios or tail probabilities. Calibration profiles must match the encoder identity declared by -`embedding`; cross-encoder calibration is rejected by default. +`embedding`; cross-encoder calibration is rejected by default. The +`uniform_hypergeometric` null is reserved for top-K overlap calibration and is +not accepted for bucket, sign, or rank-position ordinalizations. Recipes that consume sidecar manifests should run the verifier first, then load/search/rerank only if verification succeeds. diff --git a/ordvec-manifest/src/lib.rs b/ordvec-manifest/src/lib.rs index 991f10c..6202fdb 100644 --- a/ordvec-manifest/src/lib.rs +++ b/ordvec-manifest/src/lib.rs @@ -548,6 +548,7 @@ fn verify_calibration( validate_calibration_shape(calibration, report); validate_calibration_encoder(calibration, &document.manifest.embedding, report); validate_calibration_ordinalization(calibration, &document.manifest.artifact, report); + validate_calibration_null_model_ordinalization(calibration, report); validate_calibration_profile( calibration, &document.manifest.artifact, @@ -773,6 +774,30 @@ fn validate_calibration_ordinalization( } } +fn validate_calibration_null_model_ordinalization( + calibration: &CalibrationProfileRef, + report: &mut VerificationReport, +) { + if matches!( + (&calibration.null_model, &calibration.ordinalization), + ( + NullModelSpec::UniformHypergeometric, + CalibrationOrdinalization::TopK { .. 
} + ) + ) { + return; + } + if matches!( + &calibration.null_model, + NullModelSpec::UniformHypergeometric + ) { + report.error( + "calibration_null_model_ordinalization_mismatch", + "uniform_hypergeometric calibration requires top_k ordinalization", + ); + } +} + fn validate_calibration_profile( calibration: &CalibrationProfileRef, artifact: &Artifact, @@ -928,6 +953,42 @@ fn validate_calibration_parameterization( } _ => {} } + if !profile_parameterization_matches_ordinalization( + profile.parameterization, + &calibration.ordinalization, + ) { + report.error( + "calibration_profile_parameterization_ordinalization_mismatch", + "calibration profile parameterization is incompatible with calibration ordinalization", + ); + } +} + +fn profile_parameterization_matches_ordinalization( + parameterization: ProfileParameterization, + ordinalization: &CalibrationOrdinalization, +) -> bool { + match ordinalization { + CalibrationOrdinalization::TopK { .. } => matches!( + parameterization, + ProfileParameterization::MarginalTopKFrequency + | ProfileParameterization::EmpiricalTailTable + ), + CalibrationOrdinalization::Bucket { .. } => matches!( + parameterization, + ProfileParameterization::BucketFrequency | ProfileParameterization::EmpiricalTailTable + ), + CalibrationOrdinalization::Sign { .. } => matches!( + parameterization, + ProfileParameterization::SignFrequency | ProfileParameterization::EmpiricalTailTable + ), + CalibrationOrdinalization::RankPosition { .. } => matches!( + parameterization, + ProfileParameterization::RankPositionFrequency + | ProfileParameterization::EmpiricalTailTable + ), + CalibrationOrdinalization::CallerDefined { .. 
} => true, + } } fn validate_calibration_profile_shape( diff --git a/ordvec-manifest/tests/manifest.rs b/ordvec-manifest/tests/manifest.rs index 22a8e3c..4d5cf9a 100644 --- a/ordvec-manifest/tests/manifest.rs +++ b/ordvec-manifest/tests/manifest.rs @@ -409,6 +409,7 @@ fn calibration_ordinalization_matches_artifact_formats() { let rq_case = tempfile::tempdir_in(temp.path()).unwrap(); let rank_quant = write_index_kind(rq_case.path(), FixtureKind::RankQuant); let rq_manifest_path = rq_case.path().join("manifest.json"); + let rq_profile_hash = write_profile(&rq_case.path().join("bucket.f64"), 16 * 4 * 8); let mut rq_manifest = create_manifest_for_index( &rank_quant, CreateRowIdentity::RowIdIdentity, @@ -416,12 +417,16 @@ fn calibration_ordinalization_matches_artifact_formats() { &rq_manifest_path, ) .unwrap(); - rq_manifest.calibration = Some(uniform_calibration( + rq_manifest.calibration = Some(weighted_calibration( &rq_manifest, + "bucket.f64", + rq_profile_hash.clone(), CalibrationOrdinalization::Bucket { dim: rq_manifest.artifact.dim, bits: 2, }, + ProfileParameterization::BucketFrequency, + vec![rq_manifest.artifact.dim, 4], )); let report = verify_manifest_with_base( rq_manifest.clone(), @@ -429,12 +434,16 @@ fn calibration_ordinalization_matches_artifact_formats() { VerifyOptions::default(), ); assert!(report.ok, "{:?}", report.errors); - rq_manifest.calibration = Some(uniform_calibration( + rq_manifest.calibration = Some(weighted_calibration( &rq_manifest, + "bucket.f64", + rq_profile_hash, CalibrationOrdinalization::Bucket { dim: rq_manifest.artifact.dim, bits: 4, }, + ProfileParameterization::BucketFrequency, + vec![rq_manifest.artifact.dim, 4], )); let report = verify_manifest_with_base(rq_manifest, rq_case.path(), VerifyOptions::default()); assert!(error_codes(&report).contains(&"calibration_ordinalization_artifact_mismatch")); @@ -442,6 +451,7 @@ fn calibration_ordinalization_matches_artifact_formats() { let sign_case = 
tempfile::tempdir_in(temp.path()).unwrap(); let sign = write_index_kind(sign_case.path(), FixtureKind::SignBitmap); let sign_manifest_path = sign_case.path().join("manifest.json"); + let sign_profile_hash = write_profile(&sign_case.path().join("sign.f64"), 64 * 8); let mut sign_manifest = create_manifest_for_index( &sign, CreateRowIdentity::RowIdIdentity, @@ -449,11 +459,15 @@ fn calibration_ordinalization_matches_artifact_formats() { &sign_manifest_path, ) .unwrap(); - sign_manifest.calibration = Some(uniform_calibration( + sign_manifest.calibration = Some(weighted_calibration( &sign_manifest, + "sign.f64", + sign_profile_hash, CalibrationOrdinalization::Sign { dim: sign_manifest.artifact.dim, }, + ProfileParameterization::SignFrequency, + vec![sign_manifest.artifact.dim], )); let report = verify_manifest_with_base(sign_manifest, sign_case.path(), VerifyOptions::default()); @@ -462,6 +476,7 @@ fn calibration_ordinalization_matches_artifact_formats() { let rank_case = tempfile::tempdir_in(temp.path()).unwrap(); let rank = write_index_kind(rank_case.path(), FixtureKind::Rank); let rank_manifest_path = rank_case.path().join("manifest.json"); + let rank_profile_hash = write_profile(&rank_case.path().join("rank-position.f64"), 8 * 8 * 8); let mut rank_manifest = create_manifest_for_index( &rank, CreateRowIdentity::RowIdIdentity, @@ -469,17 +484,83 @@ fn calibration_ordinalization_matches_artifact_formats() { &rank_manifest_path, ) .unwrap(); - rank_manifest.calibration = Some(uniform_calibration( + rank_manifest.calibration = Some(weighted_calibration( &rank_manifest, + "rank-position.f64", + rank_profile_hash, CalibrationOrdinalization::RankPosition { dim: rank_manifest.artifact.dim, }, + ProfileParameterization::RankPositionFrequency, + vec![rank_manifest.artifact.dim, rank_manifest.artifact.dim], )); let report = verify_manifest_with_base(rank_manifest, rank_case.path(), VerifyOptions::default()); assert!(report.ok, "{:?}", report.errors); } +#[test] +fn 
uniform_hypergeometric_requires_top_k_ordinalization() { + let temp = tempfile::tempdir().unwrap(); + + let bitmap_case = tempfile::tempdir_in(temp.path()).unwrap(); + let bitmap = write_index_kind(bitmap_case.path(), FixtureKind::Bitmap); + let bitmap_manifest_path = bitmap_case.path().join("manifest.json"); + let mut bitmap_manifest = create_manifest_for_index( + &bitmap, + CreateRowIdentity::RowIdIdentity, + "test-embedding", + &bitmap_manifest_path, + ) + .unwrap(); + bitmap_manifest.calibration = Some(uniform_calibration( + &bitmap_manifest, + CalibrationOrdinalization::TopK { + dim: bitmap_manifest.artifact.dim, + k: 16, + }, + )); + let report = verify_manifest_with_base( + bitmap_manifest, + bitmap_case.path(), + VerifyOptions::default(), + ); + assert!(report.ok, "{:?}", report.errors); + + for (kind, ordinalization) in [ + ( + FixtureKind::RankQuant, + CalibrationOrdinalization::Bucket { dim: 16, bits: 2 }, + ), + ( + FixtureKind::SignBitmap, + CalibrationOrdinalization::Sign { dim: 64 }, + ), + ( + FixtureKind::Rank, + CalibrationOrdinalization::RankPosition { dim: 8 }, + ), + ] { + let case = tempfile::tempdir_in(temp.path()).unwrap(); + let index = write_index_kind(case.path(), kind); + let manifest_path = case.path().join("manifest.json"); + let mut manifest = create_manifest_for_index( + &index, + CreateRowIdentity::RowIdIdentity, + "test-embedding", + &manifest_path, + ) + .unwrap(); + manifest.calibration = Some(uniform_calibration(&manifest, ordinalization)); + let report = verify_manifest_with_base(manifest, case.path(), VerifyOptions::default()); + assert!( + error_codes(&report).contains(&"calibration_null_model_ordinalization_mismatch"), + "expected uniform_hypergeometric rejection: {:?}", + report.errors + ); + } +} + #[test] fn calibration_profile_artifact_checks_are_enforced() { let temp = tempfile::tempdir().unwrap(); @@ -566,6 +647,22 @@ fn calibration_profile_artifact_checks_are_enforced() { let report = 
verify_manifest_with_base(zero_sample, case.path(), VerifyOptions::default()); assert!(error_codes(&report).contains(&"calibration_profile_sample_count_zero")); + let mut wrong_parameterization = manifest.clone(); + let wrong_calibration = wrong_parameterization.calibration.as_mut().unwrap(); + let wrong_profile = wrong_calibration.profile.as_mut().unwrap(); + wrong_profile.parameterization = ProfileParameterization::BucketFrequency; + wrong_profile.shape.clear(); + wrong_calibration.null_model = NullModelSpec::WeightedMarginalProfile { + parameterization: ProfileParameterization::BucketFrequency, + }; + let report = verify_manifest_with_base( + wrong_parameterization, + case.path(), + VerifyOptions::default(), + ); + assert!(error_codes(&report) + .contains(&"calibration_profile_parameterization_ordinalization_mismatch")); + let outside = temp.path().join("outside-profile.f64"); let outside_hash = write_profile(&outside, manifest.artifact.dim * std::mem::size_of::()); let mut escaped = manifest.clone();