Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog.d/longterm-release-refresh.fixed.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Refresh US long-term dataset hash pins from data release manifests.
16 changes: 16 additions & 0 deletions scripts/refresh_release_bundle.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,20 @@ def main(argv: list[str] | None = None) -> int:
"--data-version",
help="New policyengine-{country}-data version (e.g. 1.83.4)",
)
parser.add_argument(
"--release-manifest-path",
help=(
"Override the data release manifest path, e.g. "
"releases/crfb-longrun-20260518/release_manifest.json"
),
)
parser.add_argument(
"--release-manifest-revision",
help=(
"HF revision to fetch the data release manifest from before "
"pinning the immutable repo commit."
),
)
parser.add_argument(
"--no-pyproject",
action="store_true",
Expand All @@ -67,6 +81,8 @@ def main(argv: list[str] | None = None) -> int:
country=args.country,
model_version=args.model_version,
data_version=args.data_version,
release_manifest_path=args.release_manifest_path,
release_manifest_revision=args.release_manifest_revision,
update_pyproject=not args.no_pyproject,
)
print(result.summary())
Expand Down
138 changes: 122 additions & 16 deletions src/policyengine/provenance/bundle.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,8 @@ def _fetch_data_release_manifest(
repo_id: str,
release_manifest_path: str,
revision: str,
*,
allow_main_fallback: bool = True,
) -> Optional[_DataReleaseManifestFetch]:
"""Fetch a data release manifest from HF if one is available.

Expand All @@ -192,17 +194,19 @@ def _fetch_data_release_manifest(
dataset artifact directly.

Data releases are stored under versioned paths, but the HF repository does
not necessarily create a matching git tag for each data version. Try the
version revision first for repositories that do publish tags, then fall
back to ``main`` and persist the immutable ``x-repo-commit`` header.
not necessarily create a matching git tag for each data version. For
inferred data-version revisions, try the version revision first for
repositories that do publish tags, then fall back to ``main`` and persist
the immutable ``x-repo-commit`` header. Explicit revisions do not get that
fallback because a typo or stale CRFB run ref should fail closed.
"""
headers = {"User-Agent": "policyengine.py"}
token = os.environ.get("HUGGING_FACE_TOKEN") or os.environ.get("HF_TOKEN")
if token:
headers["Authorization"] = f"Bearer {token}"

revisions = [revision]
if revision != "main":
if allow_main_fallback and revision != "main":
revisions.append("main")

for candidate in revisions:
Expand Down Expand Up @@ -234,6 +238,83 @@ def _updated_release_manifest_path(
return current_path


def _release_artifact_by_path(
release_manifest_json: dict,
path: str,
) -> Optional[dict]:
artifacts = release_manifest_json.get("artifacts", {})
for artifact in artifacts.values():
if artifact.get("path") == path:
return artifact
return None


def _metadata_sidecar_path(path: str) -> str:
return f"{path}.metadata.json"


def _refresh_dataset_path_references_from_data_release(
    manifest_json: dict,
    release_manifest_json: dict,
) -> None:
    """Refresh bundled dataset hash pins from a data release manifest.

    The certified default dataset is handled separately because it also carries
    a URI and build ID. This helper covers every logical dataset entry under
    ``datasets``; notably the US long-term bundle stores one entry per year with
    both H5 and metadata-sidecar hashes.

    Entries are updated in place. An entry with no ``path`` is skipped, and so
    is an entry with no existing hash pins whose path is absent from the
    release manifest.

    Args:
        manifest_json: Bundle manifest whose ``datasets`` entries are mutated
            in place. A missing or null ``datasets`` value is treated as empty.
        release_manifest_json: Parsed data release manifest supplying the new
            ``sha256`` values.

    Raises:
        ValueError: If an entry already carries a ``sha256`` or
            ``metadata_sha256`` pin that cannot be refreshed because the
            release manifest lacks the artifact or its hash. Failing here
            avoids leaving a stale pin in the bundle.
    """
    # ``or {}`` also covers an explicit ``"datasets": null`` in the manifest.
    datasets = manifest_json.get("datasets") or {}
    for path_reference in datasets.values():
        path = path_reference.get("path")
        if not path:
            continue

        artifact = _release_artifact_by_path(release_manifest_json, path)
        if artifact is None:
            if "sha256" in path_reference or "metadata_sha256" in path_reference:
                raise ValueError(
                    "Data release manifest is missing dataset artifact "
                    f"for existing pinned path {path!r}; refusing to leave "
                    "stale dataset hash pins in place."
                )
            continue

        # The artifact was matched on ``path`` itself, so the entry's path is
        # already correct; only the hashes need refreshing.
        dataset_sha256 = artifact.get("sha256")
        if dataset_sha256:
            path_reference["sha256"] = dataset_sha256
        elif "sha256" in path_reference:
            raise ValueError(
                "Data release manifest dataset artifact lacks sha256 "
                f"for existing pinned path {path!r}; refusing to leave "
                "stale dataset hash pin in place."
            )

        had_metadata_pin = "metadata_sha256" in path_reference
        metadata_artifact = _release_artifact_by_path(
            release_manifest_json,
            _metadata_sidecar_path(path),
        )
        if metadata_artifact is None:
            if had_metadata_pin:
                raise ValueError(
                    "Data release manifest is missing metadata sidecar artifact "
                    f"for {path!r}; refusing to drop existing metadata hash pin."
                )
            # No sidecar published and none pinned: nothing to record.
            continue

        metadata_sha256 = metadata_artifact.get("sha256")
        if not metadata_sha256:
            if had_metadata_pin:
                raise ValueError(
                    "Data release manifest metadata sidecar artifact lacks sha256 "
                    f"for {path!r}; refusing to drop existing metadata hash pin."
                )
            # Sidecar listed without a hash and none pinned: nothing to record.
            continue

        path_reference["metadata_sha256"] = metadata_sha256


# ---------------------------------------------------------------------------
# Refresh result
# ---------------------------------------------------------------------------
Expand Down Expand Up @@ -281,6 +362,8 @@ def refresh_release_bundle(
*,
model_version: Optional[str] = None,
data_version: Optional[str] = None,
release_manifest_path: Optional[str] = None,
release_manifest_revision: Optional[str] = None,
update_pyproject: bool = True,
manifest_dir: Path = MANIFEST_DIR,
pyproject_path: Path = PYPROJECT,
Expand All @@ -293,6 +376,11 @@ def refresh_release_bundle(
If ``None``, keeps the existing pin.
data_version: New data-package version, e.g. ``"1.83.4"``. If
``None``, keeps the existing pin.
release_manifest_path: Optional explicit data release manifest path.
Needed for custom bundles whose path does not include the data
package version, such as CRFB long-run candidate releases.
release_manifest_revision: Optional HF revision to fetch the data
release manifest from before pinning the immutable repo commit.
update_pyproject: When True, also bumps the country extra in
``pyproject.toml`` to ``model_version``.
manifest_dir: Overridable for tests.
Expand Down Expand Up @@ -339,17 +427,27 @@ def refresh_release_bundle(
data_package_json = manifest_json["data_package"]
release_manifest_json = None
new_release_manifest_revision = None
new_release_manifest_path = data_package_json.get("release_manifest_path")
if new_data != old_data and new_release_manifest_path is not None:
new_release_manifest_path = _updated_release_manifest_path(
current_path=new_release_manifest_path,
old_data=old_data,
new_data=new_data,
)
new_release_manifest_path = release_manifest_path or data_package_json.get(
"release_manifest_path"
)
should_fetch_release_manifest = new_release_manifest_path is not None and (
new_data != old_data
or release_manifest_path is not None
or release_manifest_revision is not None
)
if should_fetch_release_manifest:
if release_manifest_path is None:
new_release_manifest_path = _updated_release_manifest_path(
current_path=new_release_manifest_path,
old_data=old_data,
new_data=new_data,
)
fetch_revision = release_manifest_revision or new_data
release_manifest_fetch = _fetch_data_release_manifest(
repo_id=repo_id,
release_manifest_path=new_release_manifest_path,
revision=new_data,
revision=fetch_revision,
allow_main_fallback=release_manifest_revision is None,
)
if release_manifest_fetch is None:
raise ValueError(
Expand Down Expand Up @@ -399,12 +497,16 @@ def refresh_release_bundle(
release_manifest_json is not None
and new_release_manifest_revision is not None
and dataset_repo_id == repo_id
and dataset_revision == new_data
and dataset_revision in {new_data, release_manifest_revision}
):
dataset_revision = new_release_manifest_revision

# Only hit HF if the data version actually changed.
if new_data != old_data:
release_manifest_override = (
release_manifest_path is not None or release_manifest_revision is not None
)

# Only hit HF if the data version or release manifest target changed.
if new_data != old_data or release_manifest_override:
new_dataset_sha256 = data_artifact_json.get("sha256") or _hf_dataset_sha256(
dataset_repo_id,
dataset_path,
Expand All @@ -420,7 +522,7 @@ def refresh_release_bundle(
manifest_json["model_package"]["sha256"] = new_wheel_sha256
manifest_json["model_package"]["wheel_url"] = new_wheel_url
data_package_json["version"] = new_data
if new_data != old_data:
if new_data != old_data or release_manifest_override:
if new_release_manifest_path is not None:
data_package_json["release_manifest_path"] = new_release_manifest_path
if new_release_manifest_revision is not None:
Expand Down Expand Up @@ -469,6 +571,10 @@ def refresh_release_bundle(
certification_json["compatibility_basis"] = (
"legacy_compatible_model_package"
)
_refresh_dataset_path_references_from_data_release(
manifest_json,
release_manifest_json,
)

manifest_path.write_text(
json.dumps(manifest_json, indent=2, sort_keys=False) + "\n"
Expand Down
Loading