Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog.d/add-preservation-mirror-fields.added.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Extended `DataReleaseManifest` with optional preservation-mirror fields. Added a `PreservationMirror` model (kind, url, optional doi / sha256 / deposited_at), a `preservation_mirrors` list on each `DataReleaseArtifact`, and a top-level `preservation_dois` list on the manifest. Backward-compatible: all new fields have defaults, and pre-existing manifests continue to validate. Populated by release pipelines that mirror artifacts to Zenodo or another preservation-grade host; addresses PolicyEngine/policyengine-us-data#810.
37 changes: 37 additions & 0 deletions src/policyengine/provenance/manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,13 +60,45 @@ def resolve(self, **kwargs: str) -> str:
return self.path_template.format(**kwargs)


class PreservationMirror(BaseModel):
    """Secondary copy of a data artifact held on a preservation-grade host.

    PolicyEngine's calibrated h5 files live primarily on Hugging Face,
    which is fast and integrated with the Python client but does not
    publish a preservation commitment. Each ``PreservationMirror``
    describes one additional copy on a host that *does* publish such a
    commitment (Zenodo, the Internet Archive, or an institutional
    archive), giving a TRO citation URL somewhere to fall back to if the
    primary location is ever unavailable.

    Only ``kind`` and ``url`` are required; every other field defaults
    to ``None``. All fields are declared as plain strings, so this model
    itself declares no format checks for URLs, DOIs, hashes or
    timestamps.
    """

    kind: str
    """Short identifier for the preservation host, e.g. ``zenodo``,
    ``internet_archive`` or ``archival_gcs``."""

    url: str
    """Dereferenceable HTTPS URL of the mirrored artifact."""

    doi: Optional[str] = None
    """DOI of the preservation deposit, when the host assigns one
    (Zenodo always does)."""

    sha256: Optional[str] = None
    """Content hash of the mirrored bytes. If it equals the primary
    artifact's ``sha256``, the mirror is byte-identical and the same
    hash can be reused for verification."""

    deposited_at: Optional[str] = None
    """ISO 8601 timestamp of when the mirror was deposited, if known."""


class DataReleaseArtifact(BaseModel):
kind: str
path: str
repo_id: str
revision: str
sha256: Optional[str] = None
size_bytes: Optional[int] = None
preservation_mirrors: list[PreservationMirror] = Field(default_factory=list)
"""Durable secondary locations for this artifact. Populated when the
release pipeline mirrors the artifact to a preservation-grade host.
Empty when no preservation deposit exists yet."""

@property
def uri(self) -> str:
Expand All @@ -86,6 +118,11 @@ class DataReleaseManifest(BaseModel):
default_datasets: dict[str, str] = Field(default_factory=dict)
build: Optional[DataBuildInfo] = None
artifacts: dict[str, DataReleaseArtifact] = Field(default_factory=dict)
preservation_dois: list[str] = Field(default_factory=list)
"""DOIs covering the release as a whole (Zenodo concept: one DOI
can enclose the full set of artifacts published together). Distinct
from per-artifact DOIs on ``DataReleaseArtifact.preservation_mirrors``.
Populated when the release pipeline mirrors to a DOI-minting host."""


class DataCertification(BaseModel):
Expand Down
177 changes: 177 additions & 0 deletions tests/test_preservation_mirror.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,177 @@
"""Tests for the PreservationMirror extension to DataReleaseManifest."""

from policyengine.provenance.manifest import (
DataPackageVersion,
DataReleaseArtifact,
DataReleaseManifest,
PackageVersion,
PreservationMirror,
)


class TestPreservationMirror:
    """Construction and round-trip behaviour of ``PreservationMirror``."""

    def test_minimal_mirror_has_only_kind_and_url(self):
        """A mirror built from ``kind`` and ``url`` leaves the rest ``None``."""
        zenodo_url = "https://zenodo.org/records/10000000/files/enhanced_cps_2024.h5"
        mirror = PreservationMirror(kind="zenodo", url=zenodo_url)

        assert mirror.kind == "zenodo"
        assert mirror.url.endswith("enhanced_cps_2024.h5")
        # Every optional field must fall back to its ``None`` default.
        assert mirror.doi is None
        assert mirror.sha256 is None
        assert mirror.deposited_at is None

    def test_full_mirror_roundtrips_through_pydantic(self):
        """Dumping to a dict and validating it back yields an equal model."""
        populated_fields = {
            "kind": "zenodo",
            "url": "https://zenodo.org/records/10000000/files/enhanced_cps_2024.h5",
            "doi": "10.5281/zenodo.10000000",
            "sha256": "a" * 64,
            "deposited_at": "2026-04-21T12:00:00Z",
        }
        mirror = PreservationMirror(**populated_fields)

        round_tripped = PreservationMirror.model_validate(mirror.model_dump())

        assert round_tripped == mirror


class TestDataReleaseArtifactWithMirror:
    """``DataReleaseArtifact`` behaviour with and without mirrors attached."""

    def test_artifact_defaults_to_no_mirrors(self):
        """Omitting ``preservation_mirrors`` yields an empty list."""
        artifact_fields = {
            "kind": "dataset",
            "path": "enhanced_cps_2024.h5",
            "repo_id": "policyengine/policyengine-us-data",
            "revision": "1.85.2",
            "sha256": "a" * 64,
        }
        artifact = DataReleaseArtifact(**artifact_fields)

        assert artifact.preservation_mirrors == []

    def test_artifact_accepts_multiple_mirrors(self):
        """Several mirrors on different hosts can be attached to one artifact."""
        zenodo_mirror = PreservationMirror(
            kind="zenodo",
            url="https://zenodo.org/records/10000000/files/enhanced_cps_2024.h5",
            doi="10.5281/zenodo.10000000",
        )
        gcs_mirror = PreservationMirror(
            kind="archival_gcs",
            url="https://storage.googleapis.com/policyengine-preservation/enhanced_cps_2024.h5",
        )

        artifact = DataReleaseArtifact(
            kind="dataset",
            path="enhanced_cps_2024.h5",
            repo_id="policyengine/policyengine-us-data",
            revision="1.85.2",
            sha256="a" * 64,
            preservation_mirrors=[zenodo_mirror, gcs_mirror],
        )

        assert len(artifact.preservation_mirrors) == 2
        mirror_kinds = {m.kind for m in artifact.preservation_mirrors}
        assert mirror_kinds == {"zenodo", "archival_gcs"}

    def test_primary_uri_still_derives_from_hf_fields(self):
        """Attaching a mirror does not change the ``hf://`` primary URI."""
        zenodo_mirror = PreservationMirror(
            kind="zenodo",
            url="https://zenodo.org/records/10000000/files/enhanced_cps_2024.h5",
        )
        artifact = DataReleaseArtifact(
            kind="dataset",
            path="enhanced_cps_2024.h5",
            repo_id="policyengine/policyengine-us-data",
            revision="1.85.2",
            preservation_mirrors=[zenodo_mirror],
        )

        assert artifact.uri.startswith("hf://")
        assert "policyengine-us-data" in artifact.uri


class TestDataReleaseManifestPreservationDois:
    """Release-level ``preservation_dois`` on ``DataReleaseManifest``."""

    @staticmethod
    def _us_data_package():
        """Build the ``DataPackageVersion`` used by the non-round-trip tests."""
        return DataPackageVersion(
            name="policyengine-us-data",
            version="1.85.2",
            repo_id="policyengine/policyengine-us-data",
        )

    def test_manifest_defaults_to_empty_preservation_dois(self):
        """A manifest built without DOIs exposes an empty list."""
        manifest = DataReleaseManifest(
            schema_version=1,
            data_package=self._us_data_package(),
        )

        assert manifest.preservation_dois == []

    def test_manifest_carries_dois_when_deposited(self):
        """DOIs passed at construction are kept, in order."""
        deposited_dois = [
            "10.5281/zenodo.10000000",
            "10.5281/zenodo.10000001",
        ]
        manifest = DataReleaseManifest(
            schema_version=1,
            data_package=self._us_data_package(),
            preservation_dois=deposited_dois,
        )

        assert manifest.preservation_dois == [
            "10.5281/zenodo.10000000",
            "10.5281/zenodo.10000001",
        ]

    def test_round_trip_with_preservation_metadata(self):
        """A manifest with mirrors and DOIs survives a JSON round trip."""
        content_hash = "b" * 64
        mirror = PreservationMirror(
            kind="zenodo",
            url="https://zenodo.org/records/10000000/files/enhanced_cps_2024.h5",
            doi="10.5281/zenodo.10000000",
            sha256=content_hash,
        )
        artifact = DataReleaseArtifact(
            kind="dataset",
            path="enhanced_cps_2024.h5",
            repo_id="policyengine/policyengine-us-data",
            revision="1.85.2",
            sha256=content_hash,
            preservation_mirrors=[mirror],
        )
        # ``data_package`` is declared with the base ``PackageVersion``
        # type, so a ``DataPackageVersion`` instance would be narrowed
        # back to that declared type by the JSON round trip. That is
        # harmless in general, but it makes strict equality unstable,
        # so the base class is used directly here.
        base_package = PackageVersion(
            name="policyengine-us-data",
            version="1.85.2",
        )
        manifest = DataReleaseManifest(
            schema_version=1,
            data_package=base_package,
            artifacts={"enhanced_cps_2024": artifact},
            preservation_dois=["10.5281/zenodo.10000000"],
        )

        json_bytes = manifest.model_dump_json().encode("utf-8")
        restored = DataReleaseManifest.model_validate_json(json_bytes)

        assert restored == manifest


def test_backwards_compatibility_old_manifest_without_preservation_fields():
    """Manifests serialized before the preservation fields existed still load.

    The legacy payload below contains neither ``preservation_dois`` nor
    ``preservation_mirrors``; both are expected to come back as empty
    lists because the new fields carry defaults rather than being
    required.
    """
    legacy_json = """{
"schema_version": 1,
"data_package": {
"name": "policyengine-us-data",
"version": "1.78.2",
"repo_id": "policyengine/policyengine-us-data"
},
"artifacts": {
"enhanced_cps_2024": {
"kind": "dataset",
"path": "enhanced_cps_2024.h5",
"repo_id": "policyengine/policyengine-us-data",
"revision": "1.78.2"
}
}
}"""

    parsed = DataReleaseManifest.model_validate_json(legacy_json)

    # Release-level default.
    assert parsed.preservation_dois == []
    # Per-artifact default.
    legacy_artifact = parsed.artifacts["enhanced_cps_2024"]
    assert legacy_artifact.preservation_mirrors == []
6 changes: 3 additions & 3 deletions uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading