From 73464102bc943fe428913a3cecf636df4ab5ffc6 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Tue, 21 Apr 2026 14:49:48 -0400 Subject: [PATCH 1/2] Add preservation-mirror fields to DataReleaseManifest MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extends the data-release manifest model to carry optional preservation-grade mirror metadata: - New PreservationMirror model with kind ('zenodo', 'archival_gcs', etc.), url, and optional doi / sha256 / deposited_at fields. - New preservation_mirrors list on each DataReleaseArtifact, for per-artifact mirrors (Zenodo file deposits, GCS archival copies). - New preservation_dois list on DataReleaseManifest for release-level DOIs (Zenodo mints one per deposit covering all files). All new fields have defaults and the existing manifest JSON schema continues to validate unchanged — verified with a backwards-compatibility test that loads a legacy manifest JSON blob. This is the data contract for the Zenodo-mirror workstream scoped in PolicyEngine/policyengine-us-data#810: the us-data Modal build will deposit each certified h5 to Zenodo and populate these fields when emitting the DataReleaseManifest to HuggingFace. The TRACE TRO emission helpers will then read preservation_mirrors / preservation_dois to record durable fallback locations in every TRO they build. Motivation (2026-04-21 meeting with Lars Vilhuber / AEA Data Editor): HuggingFace doesn't publish a preservation commitment, so a TRO citation URL that resolves only through HF can 404 decades from now. Zenodo (CERN / OpenAIRE-operated, DOI-minting) is the reference preservation-grade host Lars pointed at. 9 new tests; full non-integration suite green (444 passed). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../add-preservation-mirror-fields.added.md | 1 + src/policyengine/provenance/manifest.py | 37 ++++ tests/test_preservation_mirror.py | 179 ++++++++++++++++++ uv.lock | 6 +- 4 files changed, 220 insertions(+), 3 deletions(-) create mode 100644 changelog.d/add-preservation-mirror-fields.added.md create mode 100644 tests/test_preservation_mirror.py diff --git a/changelog.d/add-preservation-mirror-fields.added.md b/changelog.d/add-preservation-mirror-fields.added.md new file mode 100644 index 00000000..9190204b --- /dev/null +++ b/changelog.d/add-preservation-mirror-fields.added.md @@ -0,0 +1 @@ +Extended `DataReleaseManifest` with optional preservation-mirror fields. Added a `PreservationMirror` model (kind, url, optional doi / sha256 / deposited_at), a `preservation_mirrors` list on each `DataReleaseArtifact`, and a top-level `preservation_dois` list on the manifest. Back-compatible: all new fields have defaults, and pre-existing manifests continue to validate. Populated by release pipelines that mirror artifacts to Zenodo or another preservation-grade host; addresses PolicyEngine/policyengine-us-data#810. diff --git a/src/policyengine/provenance/manifest.py b/src/policyengine/provenance/manifest.py index f4929554..aa4e644b 100644 --- a/src/policyengine/provenance/manifest.py +++ b/src/policyengine/provenance/manifest.py @@ -60,6 +60,34 @@ def resolve(self, **kwargs: str) -> str: return self.path_template.format(**kwargs) +class PreservationMirror(BaseModel): + """Durable mirror of a data artifact on a preservation-grade host. + + The primary host for PolicyEngine calibrated h5 files is Hugging + Face, which is fast and integrated with the Python client but does + not publish a preservation commitment. 
A ``PreservationMirror`` + records an additional copy on a host that *does* publish one + (Zenodo, the Internet Archive, or an institutional archive), so a + TRO citation URL can fall back if the primary location is ever + unavailable. + """ + + kind: str + """Short identifier for the preservation host: ``zenodo``, ``internet_archive``, ``archival_gcs``, etc.""" + + url: str + """Dereferenceable HTTPS URL for the mirrored artifact.""" + + doi: Optional[str] = None + """DOI for the preservation deposit, when the host assigns one (Zenodo always does).""" + + sha256: Optional[str] = None + """Content hash of the mirrored bytes. When equal to the primary artifact's ``sha256``, the mirror is byte-identical and the hash can be reused for verification.""" + + deposited_at: Optional[str] = None + """ISO 8601 timestamp of when the mirror was deposited, if known.""" + + class DataReleaseArtifact(BaseModel): kind: str path: str @@ -67,6 +95,10 @@ class DataReleaseArtifact(BaseModel): revision: str sha256: Optional[str] = None size_bytes: Optional[int] = None + preservation_mirrors: list[PreservationMirror] = Field(default_factory=list) + """Durable secondary locations for this artifact. Populated when the + release pipeline mirrors the artifact to a preservation-grade host. + Empty when no preservation deposit exists yet.""" @property def uri(self) -> str: @@ -86,6 +118,11 @@ class DataReleaseManifest(BaseModel): default_datasets: dict[str, str] = Field(default_factory=dict) build: Optional[DataBuildInfo] = None artifacts: dict[str, DataReleaseArtifact] = Field(default_factory=dict) + preservation_dois: list[str] = Field(default_factory=list) + """DOIs covering the release as a whole (Zenodo concept: one DOI + can enclose the full set of artifacts published together). Distinct + from per-artifact DOIs on ``DataReleaseArtifact.preservation_mirrors``. 
+ Populated when the release pipeline mirrors to a DOI-minting host.""" class DataCertification(BaseModel): diff --git a/tests/test_preservation_mirror.py b/tests/test_preservation_mirror.py new file mode 100644 index 00000000..f9619b63 --- /dev/null +++ b/tests/test_preservation_mirror.py @@ -0,0 +1,179 @@ +"""Tests for the PreservationMirror extension to DataReleaseManifest.""" + +import pytest + +from policyengine.provenance.manifest import ( + DataPackageVersion, + DataReleaseArtifact, + DataReleaseManifest, + PackageVersion, + PreservationMirror, +) + + +class TestPreservationMirror: + def test_minimal_mirror_has_only_kind_and_url(self): + mirror = PreservationMirror( + kind="zenodo", + url="https://zenodo.org/records/10000000/files/enhanced_cps_2024.h5", + ) + assert mirror.kind == "zenodo" + assert mirror.url.endswith("enhanced_cps_2024.h5") + assert mirror.doi is None + assert mirror.sha256 is None + assert mirror.deposited_at is None + + def test_full_mirror_roundtrips_through_pydantic(self): + mirror = PreservationMirror( + kind="zenodo", + url="https://zenodo.org/records/10000000/files/enhanced_cps_2024.h5", + doi="10.5281/zenodo.10000000", + sha256="a" * 64, + deposited_at="2026-04-21T12:00:00Z", + ) + dumped = mirror.model_dump() + round_tripped = PreservationMirror.model_validate(dumped) + assert round_tripped == mirror + + +class TestDataReleaseArtifactWithMirror: + def test_artifact_defaults_to_no_mirrors(self): + artifact = DataReleaseArtifact( + kind="dataset", + path="enhanced_cps_2024.h5", + repo_id="policyengine/policyengine-us-data", + revision="1.85.2", + sha256="a" * 64, + ) + assert artifact.preservation_mirrors == [] + + def test_artifact_accepts_multiple_mirrors(self): + mirrors = [ + PreservationMirror( + kind="zenodo", + url="https://zenodo.org/records/10000000/files/enhanced_cps_2024.h5", + doi="10.5281/zenodo.10000000", + ), + PreservationMirror( + kind="archival_gcs", + 
url="https://storage.googleapis.com/policyengine-preservation/enhanced_cps_2024.h5", + ), + ] + artifact = DataReleaseArtifact( + kind="dataset", + path="enhanced_cps_2024.h5", + repo_id="policyengine/policyengine-us-data", + revision="1.85.2", + sha256="a" * 64, + preservation_mirrors=mirrors, + ) + assert len(artifact.preservation_mirrors) == 2 + assert {m.kind for m in artifact.preservation_mirrors} == { + "zenodo", + "archival_gcs", + } + + def test_primary_uri_still_derives_from_hf_fields(self): + artifact = DataReleaseArtifact( + kind="dataset", + path="enhanced_cps_2024.h5", + repo_id="policyengine/policyengine-us-data", + revision="1.85.2", + preservation_mirrors=[ + PreservationMirror( + kind="zenodo", + url="https://zenodo.org/records/10000000/files/enhanced_cps_2024.h5", + ) + ], + ) + assert artifact.uri.startswith("hf://") + assert "policyengine-us-data" in artifact.uri + + +class TestDataReleaseManifestPreservationDois: + def test_manifest_defaults_to_empty_preservation_dois(self): + manifest = DataReleaseManifest( + schema_version=1, + data_package=DataPackageVersion( + name="policyengine-us-data", + version="1.85.2", + repo_id="policyengine/policyengine-us-data", + ), + ) + assert manifest.preservation_dois == [] + + def test_manifest_carries_dois_when_deposited(self): + manifest = DataReleaseManifest( + schema_version=1, + data_package=DataPackageVersion( + name="policyengine-us-data", + version="1.85.2", + repo_id="policyengine/policyengine-us-data", + ), + preservation_dois=[ + "10.5281/zenodo.10000000", + "10.5281/zenodo.10000001", + ], + ) + assert manifest.preservation_dois == [ + "10.5281/zenodo.10000000", + "10.5281/zenodo.10000001", + ] + + def test_round_trip_with_preservation_metadata(self): + mirror = PreservationMirror( + kind="zenodo", + url="https://zenodo.org/records/10000000/files/enhanced_cps_2024.h5", + doi="10.5281/zenodo.10000000", + sha256="b" * 64, + ) + artifact = DataReleaseArtifact( + kind="dataset", + 
path="enhanced_cps_2024.h5", + repo_id="policyengine/policyengine-us-data", + revision="1.85.2", + sha256="b" * 64, + preservation_mirrors=[mirror], + ) + # Use PackageVersion (not DataPackageVersion) because the + # manifest's ``data_package`` field is typed as the base class. + # Round-tripping through JSON serialization narrows any + # subclass instance back to the declared type, which is fine + # for our purposes but makes strict equality unstable. + manifest = DataReleaseManifest( + schema_version=1, + data_package=PackageVersion( + name="policyengine-us-data", + version="1.85.2", + ), + artifacts={"enhanced_cps_2024": artifact}, + preservation_dois=["10.5281/zenodo.10000000"], + ) + json_bytes = manifest.model_dump_json().encode("utf-8") + restored = DataReleaseManifest.model_validate_json(json_bytes) + assert restored == manifest + + +def test_backwards_compatibility_old_manifest_without_preservation_fields(): + """A manifest JSON emitted before the preservation fields existed must + still validate — the new fields have defaults, not required flags.""" + legacy_json = """{ + "schema_version": 1, + "data_package": { + "name": "policyengine-us-data", + "version": "1.78.2", + "repo_id": "policyengine/policyengine-us-data" + }, + "artifacts": { + "enhanced_cps_2024": { + "kind": "dataset", + "path": "enhanced_cps_2024.h5", + "repo_id": "policyengine/policyengine-us-data", + "revision": "1.78.2" + } + } + }""" + manifest = DataReleaseManifest.model_validate_json(legacy_json) + assert manifest.preservation_dois == [] + artifact = manifest.artifacts["enhanced_cps_2024"] + assert artifact.preservation_mirrors == [] diff --git a/uv.lock b/uv.lock index 65678a22..19e149a1 100644 --- a/uv.lock +++ b/uv.lock @@ -2417,9 +2417,10 @@ wheels = [ [[package]] name = "policyengine" -version = "4.3.0" +version = "4.3.1" source = { editable = "." 
} dependencies = [ + { name = "jsonschema" }, { name = "microdf-python" }, { name = "packaging" }, { name = "pandas" }, @@ -2434,7 +2435,6 @@ dev = [ { name = "build" }, { name = "furo" }, { name = "itables" }, - { name = "jsonschema" }, { name = "jupyter-book" }, { name = "mypy", version = "1.19.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, { name = "mypy", version = "1.20.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, @@ -2472,7 +2472,7 @@ requires-dist = [ { name = "build", marker = "extra == 'dev'" }, { name = "furo", marker = "extra == 'dev'" }, { name = "itables", marker = "extra == 'dev'" }, - { name = "jsonschema", marker = "extra == 'dev'", specifier = ">=4.0.0" }, + { name = "jsonschema", specifier = ">=4.0.0" }, { name = "jupyter-book", marker = "extra == 'dev'" }, { name = "microdf-python", specifier = ">=1.2.1" }, { name = "mypy", marker = "extra == 'dev'", specifier = ">=1.11.0" }, From b2d3e35d5a4128236e393a5ce745cab1f96f9670 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Thu, 7 May 2026 06:16:46 -0500 Subject: [PATCH 2/2] Remove unused preservation mirror test import --- tests/test_preservation_mirror.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/test_preservation_mirror.py b/tests/test_preservation_mirror.py index f9619b63..76c201ef 100644 --- a/tests/test_preservation_mirror.py +++ b/tests/test_preservation_mirror.py @@ -1,7 +1,5 @@ """Tests for the PreservationMirror extension to DataReleaseManifest.""" -import pytest - from policyengine.provenance.manifest import ( DataPackageVersion, DataReleaseArtifact,