diff --git a/changelog.d/longterm-release-refresh.fixed.md b/changelog.d/longterm-release-refresh.fixed.md new file mode 100644 index 00000000..3c16b9c9 --- /dev/null +++ b/changelog.d/longterm-release-refresh.fixed.md @@ -0,0 +1 @@ +Refresh US long-term dataset hash pins from data release manifests. diff --git a/scripts/refresh_release_bundle.py b/scripts/refresh_release_bundle.py index 2bbdea75..2bfac281 100644 --- a/scripts/refresh_release_bundle.py +++ b/scripts/refresh_release_bundle.py @@ -48,6 +48,20 @@ def main(argv: list[str] | None = None) -> int: "--data-version", help="New policyengine-{country}-data version (e.g. 1.83.4)", ) + parser.add_argument( + "--release-manifest-path", + help=( + "Override the data release manifest path, e.g. " + "releases/crfb-longrun-20260518/release_manifest.json" + ), + ) + parser.add_argument( + "--release-manifest-revision", + help=( + "HF revision to fetch the data release manifest from before " + "pinning the immutable repo commit." + ), + ) parser.add_argument( "--no-pyproject", action="store_true", @@ -67,6 +81,8 @@ def main(argv: list[str] | None = None) -> int: country=args.country, model_version=args.model_version, data_version=args.data_version, + release_manifest_path=args.release_manifest_path, + release_manifest_revision=args.release_manifest_revision, update_pyproject=not args.no_pyproject, ) print(result.summary()) diff --git a/src/policyengine/provenance/bundle.py b/src/policyengine/provenance/bundle.py index a8f5f476..c1def53c 100644 --- a/src/policyengine/provenance/bundle.py +++ b/src/policyengine/provenance/bundle.py @@ -184,6 +184,8 @@ def _fetch_data_release_manifest( repo_id: str, release_manifest_path: str, revision: str, + *, + allow_main_fallback: bool = True, ) -> Optional[_DataReleaseManifestFetch]: """Fetch a data release manifest from HF if one is available. @@ -192,9 +194,11 @@ def _fetch_data_release_manifest( dataset artifact directly. Data releases are stored under versioned paths, but the HF repository does - not necessarily create a matching git tag for each data version. Try the - version revision first for repositories that do publish tags, then fall - back to ``main`` and persist the immutable ``x-repo-commit`` header. + not necessarily create a matching git tag for each data version. For + inferred data-version revisions, try the version revision first for + repositories that do publish tags, then fall back to ``main`` and persist + the immutable ``x-repo-commit`` header. Explicit revisions do not get that + fallback because a typo or stale CRFB run ref should fail closed. """ headers = {"User-Agent": "policyengine.py"} token = os.environ.get("HUGGING_FACE_TOKEN") or os.environ.get("HF_TOKEN") @@ -202,7 +206,7 @@ def _fetch_data_release_manifest( headers["Authorization"] = f"Bearer {token}" revisions = [revision] - if revision != "main": + if allow_main_fallback and revision != "main": revisions.append("main") for candidate in revisions: @@ -234,6 +238,83 @@ def _updated_release_manifest_path( return current_path +def _release_artifact_by_path( + release_manifest_json: dict, + path: str, +) -> Optional[dict]: + artifacts = release_manifest_json.get("artifacts", {}) + for artifact in artifacts.values(): + if artifact.get("path") == path: + return artifact + return None + + +def _metadata_sidecar_path(path: str) -> str: + return f"{path}.metadata.json" + + +def _refresh_dataset_path_references_from_data_release( + manifest_json: dict, + release_manifest_json: dict, +) -> None: + """Refresh bundled dataset hash pins from a data release manifest. + + The certified default dataset is handled separately because it also carries + a URI and build ID. This helper covers every logical dataset entry under + ``datasets``; notably the US long-term bundle stores one entry per year with + both H5 and metadata-sidecar hashes. + """ + for path_reference in manifest_json.get("datasets", {}).values(): + path = path_reference.get("path") + if not path: + continue + artifact = _release_artifact_by_path(release_manifest_json, path) + if artifact is None: + if "sha256" in path_reference or "metadata_sha256" in path_reference: + raise ValueError( + "Data release manifest is missing dataset artifact " + f"for existing pinned path {path!r}; refusing to leave " + "stale dataset hash pins in place." + ) + continue + if artifact.get("path"): + path_reference["path"] = artifact["path"] + path = artifact["path"] + dataset_sha256 = artifact.get("sha256") + if dataset_sha256: + path_reference["sha256"] = dataset_sha256 + elif "sha256" in path_reference: + raise ValueError( + "Data release manifest dataset artifact lacks sha256 " + f"for existing pinned path {path!r}; refusing to leave " + "stale dataset hash pin in place." + ) + + metadata_artifact = _release_artifact_by_path( + release_manifest_json, + _metadata_sidecar_path(path), + ) + had_metadata_pin = "metadata_sha256" in path_reference + if metadata_artifact is None: + if had_metadata_pin: + raise ValueError( + "Data release manifest is missing metadata sidecar artifact " + f"for {path!r}; refusing to drop existing metadata hash pin." + ) + path_reference.pop("metadata_sha256", None) + continue + metadata_sha256 = metadata_artifact.get("sha256") + if not metadata_sha256: + if had_metadata_pin: + raise ValueError( + "Data release manifest metadata sidecar artifact lacks sha256 " + f"for {path!r}; refusing to drop existing metadata hash pin." + ) + path_reference.pop("metadata_sha256", None) + continue + path_reference["metadata_sha256"] = metadata_sha256 + + # --------------------------------------------------------------------------- # Refresh result # --------------------------------------------------------------------------- @@ -281,6 +362,8 @@ def refresh_release_bundle( *, model_version: Optional[str] = None, data_version: Optional[str] = None, + release_manifest_path: Optional[str] = None, + release_manifest_revision: Optional[str] = None, update_pyproject: bool = True, manifest_dir: Path = MANIFEST_DIR, pyproject_path: Path = PYPROJECT, @@ -293,6 +376,11 @@ def refresh_release_bundle( If ``None``, keeps the existing pin. data_version: New data-package version, e.g. ``"1.83.4"``. If ``None``, keeps the existing pin. + release_manifest_path: Optional explicit data release manifest path. + Needed for custom bundles whose path does not include the data + package version, such as CRFB long-run candidate releases. + release_manifest_revision: Optional HF revision to fetch the data + release manifest from before pinning the immutable repo commit. update_pyproject: When True, also bumps the country extra in ``pyproject.toml`` to ``model_version``. manifest_dir: Overridable for tests. @@ -339,17 +427,27 @@ def refresh_release_bundle( data_package_json = manifest_json["data_package"] release_manifest_json = None new_release_manifest_revision = None - new_release_manifest_path = data_package_json.get("release_manifest_path") - if new_data != old_data and new_release_manifest_path is not None: - new_release_manifest_path = _updated_release_manifest_path( - current_path=new_release_manifest_path, - old_data=old_data, - new_data=new_data, - ) + new_release_manifest_path = release_manifest_path or data_package_json.get( + "release_manifest_path" + ) + should_fetch_release_manifest = new_release_manifest_path is not None and ( + new_data != old_data + or release_manifest_path is not None + or release_manifest_revision is not None + ) + if should_fetch_release_manifest: + if release_manifest_path is None: + new_release_manifest_path = _updated_release_manifest_path( + current_path=new_release_manifest_path, + old_data=old_data, + new_data=new_data, + ) + fetch_revision = release_manifest_revision or new_data release_manifest_fetch = _fetch_data_release_manifest( repo_id=repo_id, release_manifest_path=new_release_manifest_path, - revision=new_data, + revision=fetch_revision, + allow_main_fallback=release_manifest_revision is None, ) if release_manifest_fetch is None: raise ValueError( @@ -399,12 +497,16 @@ def refresh_release_bundle( release_manifest_json is not None and new_release_manifest_revision is not None and dataset_repo_id == repo_id - and dataset_revision == new_data + and dataset_revision in {new_data, release_manifest_revision} ): dataset_revision = new_release_manifest_revision - # Only hit HF if the data version actually changed. - if new_data != old_data: + release_manifest_override = ( + release_manifest_path is not None or release_manifest_revision is not None + ) + + # Only hit HF if the data version or release manifest target changed. + if new_data != old_data or release_manifest_override: new_dataset_sha256 = data_artifact_json.get("sha256") or _hf_dataset_sha256( dataset_repo_id, dataset_path, @@ -420,7 +522,7 @@ def refresh_release_bundle( manifest_json["model_package"]["sha256"] = new_wheel_sha256 manifest_json["model_package"]["wheel_url"] = new_wheel_url data_package_json["version"] = new_data - if new_data != old_data: + if new_data != old_data or release_manifest_override: if new_release_manifest_path is not None: data_package_json["release_manifest_path"] = new_release_manifest_path if new_release_manifest_revision is not None: @@ -469,6 +571,10 @@ def refresh_release_bundle( certification_json["compatibility_basis"] = ( "legacy_compatible_model_package" ) + _refresh_dataset_path_references_from_data_release( + manifest_json, + release_manifest_json, + ) manifest_path.write_text( json.dumps(manifest_json, indent=2, sort_keys=False) + "\n" diff --git a/tests/test_bundle_refresh.py b/tests/test_bundle_refresh.py index ae8f2305..68d61f9b 100644 --- a/tests/test_bundle_refresh.py +++ b/tests/test_bundle_refresh.py @@ -85,8 +85,20 @@ def _data_release_manifest_response( model_version: str = "1.653.3", data_version: str = "1.83.4", dataset_sha256: str = "e" * 64, + extra_artifacts: dict | None = None, headers: dict | None = None, ): + artifacts = { + "enhanced_cps_2024": { + "kind": "microdata", + "path": "enhanced_cps_2024.h5", + "repo_id": "policyengine/policyengine-us-data", + "revision": data_version, + "sha256": dataset_sha256, + } + } + if extra_artifacts: + artifacts.update(extra_artifacts) payload = { "schema_version": 1, "data_package": { @@ -102,15 +114,7 @@ def _data_release_manifest_response( "data_build_fingerprint": "sha256:fingerprint", }, }, - "artifacts": { - "enhanced_cps_2024": { - "kind": "microdata", - "path": "enhanced_cps_2024.h5", - "repo_id": "policyengine/policyengine-us-data", - "revision": data_version, - "sha256": dataset_sha256, - } - }, + "artifacts": artifacts, } return _FakeHFResponse( json.dumps(payload).encode(), @@ -427,6 +431,259 @@ def fake_urlopen(request, *args, **kwargs): ) +def test__custom_release_manifest_refreshes_long_term_dataset_hashes( + sandbox, +) -> None: + manifest_path = sandbox["manifest_dir"] / "us.json" + manifest = json.loads(manifest_path.read_text()) + manifest["data_package"]["release_manifest_path"] = ( + "releases/crfb-longrun-old/release_manifest.json" + ) + manifest["data_package"]["release_manifest_revision"] = "crfb-longrun-old" + manifest["datasets"]["long_term_cps_2100"] = { + "path": "long_term/2100.h5", + "sha256": "1" * 64, + "metadata_sha256": "2" * 64, + } + manifest_path.write_text(json.dumps(manifest, indent=2)) + + def fake_urlopen(request, *args, **kwargs): + url = request.full_url + if ( + "/resolve/crfb-longrun-new/releases/crfb-longrun-new/release_manifest.json" + in url + ): + return _data_release_manifest_response( + data_version="1.83.4", + extra_artifacts={ + "long_term/2100": { + "kind": "microdata", + "path": "long_term/2100.h5", + "repo_id": "policyengine/policyengine-us-data", + "revision": "1.83.4", + "sha256": "3" * 64, + }, + "long_term/2100.h5.metadata": { + "kind": "auxiliary", + "path": "long_term/2100.h5.metadata.json", + "repo_id": "policyengine/policyengine-us-data", + "revision": "1.83.4", + "sha256": "4" * 64, + }, + }, + ) + raise AssertionError(f"Unexpected URL fetched: {url}") + + with patch("policyengine.provenance.bundle.urlopen", side_effect=fake_urlopen): + refresh_release_bundle( + country="us", + data_version="1.83.4", + release_manifest_path="releases/crfb-longrun-new/release_manifest.json", + release_manifest_revision="crfb-longrun-new", + manifest_dir=sandbox["manifest_dir"], + pyproject_path=sandbox["pyproject_path"], + ) + + written = json.loads((sandbox["manifest_dir"] / "us.json").read_text()) + assert ( + written["data_package"]["release_manifest_path"] + == "releases/crfb-longrun-new/release_manifest.json" + ) + assert ( + written["data_package"]["release_manifest_revision"] + == "release-manifest-commit-sha" + ) + assert written["datasets"]["long_term_cps_2100"]["sha256"] == "3" * 64 + assert written["datasets"]["long_term_cps_2100"]["metadata_sha256"] == "4" * 64 + assert ( + written["certified_data_artifact"]["uri"] + == "hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5@release-manifest-commit-sha" + ) + + +def test__custom_release_manifest_requires_existing_long_term_metadata_sidecar( + sandbox, +) -> None: + manifest_path = sandbox["manifest_dir"] / "us.json" + manifest = json.loads(manifest_path.read_text()) + manifest["data_package"]["release_manifest_path"] = ( + "releases/crfb-longrun-old/release_manifest.json" + ) + manifest["data_package"]["release_manifest_revision"] = "crfb-longrun-old" + manifest["datasets"]["long_term_cps_2100"] = { + "path": "long_term/2100.h5", + "sha256": "1" * 64, + "metadata_sha256": "2" * 64, + } + manifest_path.write_text(json.dumps(manifest, indent=2)) + + def fake_urlopen(request, *args, **kwargs): + url = request.full_url + if ( + "/resolve/crfb-longrun-new/releases/crfb-longrun-new/release_manifest.json" + in url + ): + return _data_release_manifest_response( + data_version="1.83.4", + extra_artifacts={ + "long_term/2100": { + "kind": "microdata", + "path": "long_term/2100.h5", + "repo_id": "policyengine/policyengine-us-data", + "revision": "1.83.4", + "sha256": "3" * 64, + }, + }, + ) + raise AssertionError(f"Unexpected URL fetched: {url}") + + with patch("policyengine.provenance.bundle.urlopen", side_effect=fake_urlopen): + with pytest.raises( + ValueError, + match="missing metadata sidecar artifact", + ): + refresh_release_bundle( + country="us", + data_version="1.83.4", + release_manifest_path="releases/crfb-longrun-new/release_manifest.json", + release_manifest_revision="crfb-longrun-new", + manifest_dir=sandbox["manifest_dir"], + pyproject_path=sandbox["pyproject_path"], + ) + + +def test__custom_release_manifest_requires_existing_long_term_dataset_artifact( + sandbox, +) -> None: + manifest_path = sandbox["manifest_dir"] / "us.json" + manifest = json.loads(manifest_path.read_text()) + manifest["data_package"]["release_manifest_path"] = ( + "releases/crfb-longrun-old/release_manifest.json" + ) + manifest["data_package"]["release_manifest_revision"] = "crfb-longrun-old" + manifest["datasets"]["long_term_cps_2100"] = { + "path": "long_term/2100.h5", + "sha256": "1" * 64, + "metadata_sha256": "2" * 64, + } + manifest_path.write_text(json.dumps(manifest, indent=2)) + + def fake_urlopen(request, *args, **kwargs): + url = request.full_url + if ( + "/resolve/crfb-longrun-new/releases/crfb-longrun-new/release_manifest.json" + in url + ): + return _data_release_manifest_response(data_version="1.83.4") + raise AssertionError(f"Unexpected URL fetched: {url}") + + with patch("policyengine.provenance.bundle.urlopen", side_effect=fake_urlopen): + with pytest.raises( + ValueError, + match="missing dataset artifact", + ): + refresh_release_bundle( + country="us", + data_version="1.83.4", + release_manifest_path="releases/crfb-longrun-new/release_manifest.json", + release_manifest_revision="crfb-longrun-new", + manifest_dir=sandbox["manifest_dir"], + pyproject_path=sandbox["pyproject_path"], + ) + + +def test__custom_release_manifest_requires_existing_long_term_dataset_sha( + sandbox, +) -> None: + manifest_path = sandbox["manifest_dir"] / "us.json" + manifest = json.loads(manifest_path.read_text()) + manifest["data_package"]["release_manifest_path"] = ( + "releases/crfb-longrun-old/release_manifest.json" + ) + manifest["data_package"]["release_manifest_revision"] = "crfb-longrun-old" + manifest["datasets"]["long_term_cps_2100"] = { + "path": "long_term/2100.h5", + "sha256": "1" * 64, + "metadata_sha256": "2" * 64, + } + manifest_path.write_text(json.dumps(manifest, indent=2)) + + def fake_urlopen(request, *args, **kwargs): + url = request.full_url + if ( + "/resolve/crfb-longrun-new/releases/crfb-longrun-new/release_manifest.json" + in url + ): + return _data_release_manifest_response( + data_version="1.83.4", + extra_artifacts={ + "long_term/2100": { + "kind": "microdata", + "path": "long_term/2100.h5", + "repo_id": "policyengine/policyengine-us-data", + "revision": "1.83.4", + }, + "long_term/2100.h5.metadata": { + "kind": "auxiliary", + "path": "long_term/2100.h5.metadata.json", + "repo_id": "policyengine/policyengine-us-data", + "revision": "1.83.4", + "sha256": "4" * 64, + }, + }, + ) + raise AssertionError(f"Unexpected URL fetched: {url}") + + with patch("policyengine.provenance.bundle.urlopen", side_effect=fake_urlopen): + with pytest.raises( + ValueError, + match="dataset artifact lacks sha256", + ): + refresh_release_bundle( + country="us", + data_version="1.83.4", + release_manifest_path="releases/crfb-longrun-new/release_manifest.json", + release_manifest_revision="crfb-longrun-new", + manifest_dir=sandbox["manifest_dir"], + pyproject_path=sandbox["pyproject_path"], + ) + + +def test__explicit_release_manifest_revision_does_not_fallback_to_main( + sandbox, +) -> None: + seen_urls = [] + + def fake_urlopen(request, *args, **kwargs): + url = request.full_url + seen_urls.append(url) + if ( + "/resolve/bad-crfb-ref/releases/crfb-longrun-new/release_manifest.json" + in url + ): + raise HTTPError(url, 404, "Not Found", hdrs=None, fp=None) + if "/resolve/main/releases/crfb-longrun-new/release_manifest.json" in url: + return _data_release_manifest_response(data_version="1.83.4") + raise AssertionError(f"Unexpected URL fetched: {url}") + + with patch("policyengine.provenance.bundle.urlopen", side_effect=fake_urlopen): + with pytest.raises( + ValueError, + match="Could not fetch data release manifest", + ): + refresh_release_bundle( + country="us", + data_version="1.83.4", + release_manifest_path="releases/crfb-longrun-new/release_manifest.json", + release_manifest_revision="bad-crfb-ref", + manifest_dir=sandbox["manifest_dir"], + pyproject_path=sandbox["pyproject_path"], + ) + + assert any("/resolve/bad-crfb-ref/" in url for url in seen_urls) + assert not any("/resolve/main/" in url for url in seen_urls) + + def test__missing_release_manifest_metadata_raises(sandbox) -> None: def fake_urlopen(request, *args, **kwargs): url = request.full_url