From 0f98d6ed67609bcaecb21d33097111741cc59c76 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Sun, 17 May 2026 16:22:31 -0400 Subject: [PATCH 1/2] Add US long-run bundle manifest --- pyproject.toml | 8 +- .../data/release_manifests/us.json | 397 +++++++++++++++++- .../release_manifests/us.trace.tro.jsonld | 48 +-- src/policyengine/provenance/manifest.py | 24 ++ .../tax_benefit_models/us/__init__.py | 2 + .../tax_benefit_models/us/datasets.py | 354 ++++++++++++++-- tests/test_models.py | 2 +- tests/test_release_manifests.py | 78 ++-- tests/test_us_long_term_datasets.py | 215 ++++++++++ uv.lock | 16 +- 10 files changed, 1015 insertions(+), 129 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 7b19bdff..22731f42 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,8 +46,8 @@ uk = [ "policyengine-uk==2.88.14", ] us = [ - "policyengine_core>=3.26.1", - "policyengine-us==1.691.10", + "policyengine_core==3.26.1", + "policyengine-us==1.691.12", ] dev = [ "pytest", @@ -60,9 +60,9 @@ dev = [ "plotly>=5.0.0", "pytest-asyncio>=0.26.0", "ruff>=0.9.0", - "policyengine_core>=3.26.1", + "policyengine_core==3.26.1", "policyengine-uk==2.88.14", - "policyengine-us==1.691.10", + "policyengine-us==1.691.12", "towncrier>=24.8.0", "mypy>=1.11.0", "pytest-cov>=5.0.0", diff --git a/src/policyengine/data/release_manifests/us.json b/src/policyengine/data/release_manifests/us.json index b21fcc55..05315263 100644 --- a/src/policyengine/data/release_manifests/us.json +++ b/src/policyengine/data/release_manifests/us.json @@ -5,16 +5,16 @@ "policyengine_version": "4.5.1", "model_package": { "name": "policyengine-us", - "version": "1.691.10", - "sha256": "a2c8365f152c2ec3332e996c4003d560b433b6c204bdfcc73bf094a0ae49f10e", - "wheel_url": "https://files.pythonhosted.org/packages/a7/a3/53630e30285cb441cc96e5264621b1b9e5af3b648ccd176b376f0e03a458/policyengine_us-1.691.10-py3-none-any.whl" + "version": "1.691.12", + "sha256": "ef43482bd8c6cc16f8f1d4050423f5dc1d045af15931f5d1b089715a31c839d2", + "wheel_url": "https://files.pythonhosted.org/packages/83/6f/b605fc1d8e06e377ae50870dc44a28cfe6562f0032e36dd53a5ef49472db/policyengine_us-1.691.12-py3-none-any.whl" }, "data_package": { "name": "policyengine-us-data", "version": "1.115.3", "repo_id": "policyengine/policyengine-us-data", - "release_manifest_path": "releases/1.115.3/release_manifest.json", - "release_manifest_revision": "69fc39a7fece4c49ba87291e598e76b40568cc5d" + "release_manifest_path": "releases/crfb-longrun-20260517/release_manifest.json", + "release_manifest_revision": "crfb-longrun-20260517" }, "certified_data_artifact": { "data_package": { @@ -27,17 +27,392 @@ "sha256": "96965b0d9931d36beb29486b25fffee8b630e3b62c9376e7265976e02f7ab3ca" }, "certification": { - "compatibility_basis": "exact_build_model_version", - "data_build_id": "policyengine-us-data-1.115.3", - "built_with_model_version": "1.691.10", - "certified_for_model_version": "1.691.10", - "certified_by": "policyengine.py bundled manifest", - "data_build_fingerprint": "sha256:b80625974285a9cff23bc2436fc46df98477997d50aea61e73b7f137b5121584" + "compatibility_basis": "candidate_long_term_bundle", + "data_build_id": "policyengine-us-data-crfb-longrun-20260517", + "built_with_model_version": "1.691.12", + "certified_for_model_version": "1.691.12", + "certified_by": "policyengine.py candidate long-term manifest", + "data_build_fingerprint": "sha256:ae675b873404ec1fdc461e056a02958816fd0f8ab08ba5883fb05c4cf40f5b49" }, "default_dataset": "enhanced_cps_2024", "datasets": { "enhanced_cps_2024": { "path": "enhanced_cps_2024.h5" + }, + "long_term_cps_2026": { + "path": "long_term/2026.h5", + "sha256": "05d0db5d93d42716e6bef84610f582ad0ad79d2d45bf5b2f8a8f0e448fd20eb5", + "metadata_sha256": "8df313523f87f4e3224d25c47d63e4f709778df017ff4a8a3cb9389e05d029db" + }, + "long_term_cps_2027": { + "path": "long_term/2027.h5", + "sha256": "659603f0a990e1ecea9938d6bae71afcfa82b5ecf2a4593c4431e5797ad1ac47", + "metadata_sha256": "3b6a406f850b4af3ba6501fee77028303acab4195625475f0434b9407f7d91f5" + }, + "long_term_cps_2028": { + "path": "long_term/2028.h5", + "sha256": "c1319b1656d8944a98065de5ff5a175e75db27004666b78ef4f22ebe1a4da5ec", + "metadata_sha256": "dada54cade37ceaa2f48d36f1353ef6043fb9edfa66545025a6f5223e23322e9" + }, + "long_term_cps_2029": { + "path": "long_term/2029.h5", + "sha256": "157cb62502fe336f0fb4c103a24fd5c2cdfa70042d5578a9a5eb875d8113d12b", + "metadata_sha256": "d178b74fcc95e40e690caa44c5a6e84fdf0a8a564eb1a415295417cfa87d0979" + }, + "long_term_cps_2030": { + "path": "long_term/2030.h5", + "sha256": "7806711e4ffee40736df6cd3cc4e1ddaccd14b4a85eafac558a607cc3d3661a2", + "metadata_sha256": "3273ed563bbc49e628e858b24ccf0274e4f60f970860049331d33e9f0ee3d5ac" + }, + "long_term_cps_2031": { + "path": "long_term/2031.h5", + "sha256": "760970322807c4cc51b8c7bf2ea3f1cd82e3d93d087d4562eb6eb337183739ac", + "metadata_sha256": "80641ec32fdba8d4feeb49c603859018a4eb0cbf7ee63cabe058ed5f94aa1ab0" + }, + "long_term_cps_2032": { + "path": "long_term/2032.h5", + "sha256": "b0037f3f742a5f26faba9b8f6eee15ef6d66669ff62a318c744afe1d7a7e1d6d", + "metadata_sha256": "9346d0bfd227784686a7f22066f4a33d9a9b38757e6fb8d627d22b87a45a7774" + }, + "long_term_cps_2033": { + "path": "long_term/2033.h5", + "sha256": "6ba7ba0f44342e603d58772795546caa38be00fe9faf0e9b94e462b371ec73a2", + "metadata_sha256": "6b21670d7f4319c3709f990f98f938815eb634ebf7597b058142e84e1ac99177" + }, + "long_term_cps_2034": { + "path": "long_term/2034.h5", + "sha256": "bd64dcc3fdc69659f5cd6a339f127a8812df1ca72240623d1e9a57bc801b8740", + "metadata_sha256": "aca31531c8ebc971274ba2593d7dd43d8244eb3f09b14bcd7a0ee6b04331cb38" + }, + "long_term_cps_2035": { + "path": "long_term/2035.h5", + "sha256": "0b898849c85dae33b81e7db8c7b5dc979b7421afb7e2355b2fc405e6ed8e0d6e", + "metadata_sha256": "39f8067325b997d008404824271b4d2a9f132a9465079f031122747c98b2ab35" + }, + "long_term_cps_2036": { + "path": "long_term/2036.h5", + "sha256": "a72d21f918f34e9ef82996b91db5435013a1549e2150feeb6f31c3e6ad35bcee", + "metadata_sha256": "ba0bea04ec2ac9e07589321d4f09efcf4072cc10d331f789984abca11ca36ead" + }, + "long_term_cps_2037": { + "path": "long_term/2037.h5", + "sha256": "07c8d3adc71f085c2c2c8836f333ce349e73e41c0855dc6019dc4174aa4f7615", + "metadata_sha256": "accdfa1a3f1668b5d8dd883fa91289ca2aec151f71c8b3775dc573cbde5f476e" + }, + "long_term_cps_2038": { + "path": "long_term/2038.h5", + "sha256": "14e4012d4c17848b2a1f2bb64f08ecd5014a0dcf27e5aa980f7a25ce8da2c534", + "metadata_sha256": "bad7815e399ff84723d3f856b7a5d0903301b5d8d23cd24b5b9f4cc543f7eb8b" + }, + "long_term_cps_2039": { + "path": "long_term/2039.h5", + "sha256": "1e1542296890e5f373b7cf14ca3297a63354bd586502ccf5a2793bf129b2f200", + "metadata_sha256": "e45fa3a9ab1dd3a1d513723b5e0512fb081a325872bd0fbf5e60b5664288c9c2" + }, + "long_term_cps_2040": { + "path": "long_term/2040.h5", + "sha256": "0ae3cfdd45bc41bb537b9335c8614e841e166a215846e90c1f641932c2f9ef13", + "metadata_sha256": "a7d49aaefc5f1de8f3f1ee511d971e3ea54fb12f8d0b9c138dd5d251730836e1" + }, + "long_term_cps_2041": { + "path": "long_term/2041.h5", + "sha256": "55a93b54dbcfab23cacd62c3143c6c166f96dac1e2f2910a84cd8447012cf70c", + "metadata_sha256": "763a5942de34d7deb9efd3e81e04d1636fc43154512e32c36a3c0e886e6ea167" + }, + "long_term_cps_2042": { + "path": "long_term/2042.h5", + "sha256": "2408ca6f1089d27293d833994301ffa79f282258a032b005f330057d4464a0f1", + "metadata_sha256": "5337e90f557e3693a60798b19368a23b1a8a6ceea730decc08af27d53ff90c2f" + }, + "long_term_cps_2043": { + "path": "long_term/2043.h5", + "sha256": "ac3542c30801cc250c6a55f0389f300e4c75e5b20200882603251819af69e8d0", + "metadata_sha256": "cf7f7459271c9828825381f087beffbd31bfc9f3f52c125edbcb06c28187fc11" + }, + "long_term_cps_2044": { + "path": "long_term/2044.h5", + "sha256": "8f03f2d814fb808f03d9d2ec6ec013b1bb118bc3ad31e74b72f427774680e04e", + "metadata_sha256": "bdeeb7fce41c3d6bb06e666b2fa0356902913aa9095134ec7cadb2bbbcaf1340" + }, + "long_term_cps_2045": { + "path": "long_term/2045.h5", + "sha256": "8ce3c5357f197f14f03b06b0506ed24696bb2f6d9891754b19adb0f5985b9c2e", + "metadata_sha256": "f611b2b0e93b6cb33aadc65fe5de47f6ef638cbb5d396c3f7eff7f93c70d7536" + }, + "long_term_cps_2046": { + "path": "long_term/2046.h5", + "sha256": "06821f100b2337158ef37dd1a76cd286d636c37d19337d76848ba8a77f0a1827", + "metadata_sha256": "dbb9b4d44b593db1691820e2df4ffb7d9ba32381e370779b58519a81d2be2713" + }, + "long_term_cps_2047": { + "path": "long_term/2047.h5", + "sha256": "bf7024d14d88efa8d022ca12b1396165936ba033621efc8245f00ff903a17e1a", + "metadata_sha256": "3f56b9c0208a40a0055130b298d405da61f0c406cc25635c1df319505c1a6e75" + }, + "long_term_cps_2048": { + "path": "long_term/2048.h5", + "sha256": "81486267e7af76c1158adf0d9b3236f8589cf9539590e2d54afc559bc5fd9126", + "metadata_sha256": "dafd43f6c50dbe018ba085214f3590db63764f35a7b09f3744f2d70c3dd8c896" + }, + "long_term_cps_2049": { + "path": "long_term/2049.h5", + "sha256": "c3554953d5ce8e63468eac00f1de6925090d7705a11b0bd5ac5ae04a3b4ce3d0", + "metadata_sha256": "28d07cd2ba2122bb9dc7d0679fede54157342b2954ae2cdcbc2bae8608bff823" + }, + "long_term_cps_2050": { + "path": "long_term/2050.h5", + "sha256": "71eb52e488ff028f7219427c777cb1e05cc8df8911173747eb4cf39bb0e77b97", + "metadata_sha256": "5a44ed3377d3d088a61ad117d66e391d6a39c486c20d03cf122916b7bde6b809" + }, + "long_term_cps_2051": { + "path": "long_term/2051.h5", + "sha256": "2c9fe16ea725f7db52824eaa214c97171f6e0a9a245c9b56e71d0b195fa05c67", + "metadata_sha256": "a5657f928fa8454791645e05dd3052483be67702c5acff82beae33f3154b2727" + }, + "long_term_cps_2052": { + "path": "long_term/2052.h5", + "sha256": "9f5c1623e005de63d995a5a039a1b497b62d6d57f6e4fe62da42dc090f84f8c3", + "metadata_sha256": "b93ead6c5910a0c7d75f383c35a7c5d2276648b1e41cc8dff7679c352e89ea09" + }, + "long_term_cps_2053": { + "path": "long_term/2053.h5", + "sha256": "8020e7e09f2635d55823ef043269c0642a7882558512c4194fa8f96d36ac2aec", + "metadata_sha256": "9b2a9355d502eaa60e36957dccf91bd61bb2be260b6f5f705f993d08299dd1d5" + }, + "long_term_cps_2054": { + "path": "long_term/2054.h5", + "sha256": "860dd33f65d2ecb1ff7bb8604b760e65c4d711b307b58750733cc8da010936ac", + "metadata_sha256": "3b3c3d783cb5313529604bf44d82ea58065381898990a2bc6f4489a6e6a12d01" + }, + "long_term_cps_2055": { + "path": "long_term/2055.h5", + "sha256": "35efe4a2968a33ed5c00a70d856223cbc40b8e869a2119e1effa47ab381c2f25", + "metadata_sha256": "8eb21d39ce9d57d36dc082199d8b1e852f61b95da60baa0afd1c9a30c3a9929a" + }, + "long_term_cps_2056": { + "path": "long_term/2056.h5", + "sha256": "356671dda8f3547aad8f310e4774be0d142ed39c3995f7de22bc19db0ffbc1ee", + "metadata_sha256": "ac34719088c5a22fd24732fcb707d36f6bd9b90aac756ad357ce670de455a9e5" + }, + "long_term_cps_2057": { + "path": "long_term/2057.h5", + "sha256": "1c707ec0e200396af3eee63a0923cd98fc1dbc06259eb608505b7fbb7cbad3a7", + "metadata_sha256": "88dd0961c12484e3c0add615471f0c69985976373a70e5dd7690244325aff33d" + }, + "long_term_cps_2058": { + "path": "long_term/2058.h5", + "sha256": "3dd76c6936eb6b86ba5fd683f5cf30923d3dd299f0203700d368df05e5cfe395", + "metadata_sha256": "d5d0b8e5f1fe8e71f38c7f9d410b83d7c540bfb1958b0e604f18f6e0f2e52e45" + }, + "long_term_cps_2059": { + "path": "long_term/2059.h5", + "sha256": "28c82a32a69f5536a3a86e9ebfd929832063e98702e38f5e6a60dbb567c946ce", + "metadata_sha256": "e1aa9fd3369172b29ed281a514544910920a283e56a2df1ecb89866b8bfa24f3" + }, + "long_term_cps_2060": { + "path": "long_term/2060.h5", + "sha256": "e45e504eda62912acebde8617ea1d38a90477d74c84a4d7c8f6460db14bea0ac", + "metadata_sha256": "9acfc62ed8d54272217ff4e58071cfd0dbd39c3fbf9568aba1c423a235824f13" + }, + "long_term_cps_2061": { + "path": "long_term/2061.h5", + "sha256": "3abfdd454565f88d75598cebe4c7c7f646d40ff6e601d949620e47e503bd0839", + "metadata_sha256": "5846ba94c8d1189c8ef8f751ac1fbd694c437245d1352436bb1440cf9a933432" + }, + "long_term_cps_2062": { + "path": "long_term/2062.h5", + "sha256": "ad0d45178bd772f4c1d23da6f063264b586caa4736a562b47ac7df2838eac57e", + "metadata_sha256": "7b873c9139eb0649230cbc29ee79e719753d6e9e05b05f1389427e77e6320e8c" + }, + "long_term_cps_2063": { + "path": "long_term/2063.h5", + "sha256": "ce61c4966b6ff492293629e1864ae9b8ebf79306975350b0968e26a59e544b35", + "metadata_sha256": "078f921e7e74f1f0b80a33a036ab92b604cee6ff019264cc95501cac1246729e" + }, + "long_term_cps_2064": { + "path": "long_term/2064.h5", + "sha256": "e2a1da1eda1e9d3d53862984d9fe638284ecd925a8c9063df2bd2c12a4911047", + "metadata_sha256": "225653ed31deec9f414702844dd411624d620582d37f4aa7dad57f2a3f576b20" + }, + "long_term_cps_2065": { + "path": "long_term/2065.h5", + "sha256": "65199a8fb6c480ab6acb72f5da7a728f6f425b98b841b398a6f691897ad845f8", + "metadata_sha256": "a42a6a5fd52c371cc3a964fb3a0451d9903972b83fe50c41847e6be4c5d6adc5" + }, + "long_term_cps_2066": { + "path": "long_term/2066.h5", + "sha256": "24ce80fd8741eb1aa93cf61cf289a61519d4c58826d449f390d89f210f492b48", + "metadata_sha256": "db28ebeb2eeb79d8789e90d78e5576507832139d9e16de7f7feab1a796affb6a" + }, + "long_term_cps_2067": { + "path": "long_term/2067.h5", + "sha256": "846291a6111ffd6c482bbb6206677f3ea24fb79acc8301a828e9efb4f3037489", + "metadata_sha256": "ebbb5dddd9004f02a897123da761eeaffc887ed9087b6bf7e36b0a9e05cd61dd" + }, + "long_term_cps_2068": { + "path": "long_term/2068.h5", + "sha256": "680042e9b990b3b01b2eabc89644e5db289ba26030209a56cba8c12e4ac4b591", + "metadata_sha256": "3236b483747c213efc2b738305325c651518bfe6f7b563014ca76eee3286e39f" + }, + "long_term_cps_2069": { + "path": "long_term/2069.h5", + "sha256": "7ee5d927756d3e5617e349ec04640707c868a97472db57bacc926d66619dbc22", + "metadata_sha256": "e2fe2cf4d838107d22019ae24d39b112b1f1d53da3a21b6cd4456ccd165a6e24" + }, + "long_term_cps_2070": { + "path": "long_term/2070.h5", + "sha256": "3a600e297739f9bac6445b36841561209606cf096d9c5e082290448214aeae99", + "metadata_sha256": "335031ba516ee39c524729d2e0737a1f082d8dd67c04eca590f8c16047ed1837" + }, + "long_term_cps_2071": { + "path": "long_term/2071.h5", + "sha256": "fcd9ff7d1b087d63eb2ce86802408818f8a786bddc7c0ee5fd9b10d681431936", + "metadata_sha256": "f0a32b3f8bb33f03fe9e975d0ec1853b48b9639c5151d56addae6efb3082ef94" + }, + "long_term_cps_2072": { + "path": "long_term/2072.h5", + "sha256": "217bde3730fc6d466f1dcb3a36c4968a32d23b63d246d88946dc360cf56f41f7", + "metadata_sha256": "57144024f8fc72fc83e3ca62347c919db9c4f6e230ea9cb2c9b67e048cf578ff" + }, + "long_term_cps_2073": { + "path": "long_term/2073.h5", + "sha256": "e1a47bc346b5387c9f1a08af37ba34acb885de1227efbb7a0e1f0bd972819350", + "metadata_sha256": "eaf3de8b50e852213408ca078790284f3c5cf0ff05a3defc1b47588cde0bdf06" + }, + "long_term_cps_2074": { + "path": "long_term/2074.h5", + "sha256": "fddbca51883102bf70364726c3011ef486faab38b03e166193a1f879a1d6675c", + "metadata_sha256": "9b163ca69cca800395b50e4df5a3021de574eece0db03c79f3136ee37ada0cf5" + }, + "long_term_cps_2075": { + "path": "long_term/2075.h5", + "sha256": "40d3d44bdc6534e6f205038c00e28d4cb472878e2d881e2c7c11127f47d20f30", + "metadata_sha256": "6225b168a3edc8199045ac58789edf47209bb9968e436785e4ce6860b8dfdfe2" + }, + "long_term_cps_2076": { + "path": "long_term/2076.h5", + "sha256": "610660aeb4e3650cb21385e2fc791d9ad827f9ea265e61aac2a95bc11de7a874", + "metadata_sha256": "793ed33a309218c0dba586c07311f72a11a0dff0eece16fc0039c3bd5d478f8e" + }, + "long_term_cps_2077": { + "path": "long_term/2077.h5", + "sha256": "c6404751614c9d6f46af4fd7622908e9d02a4d2063aed6992826b00038f13f6a", + "metadata_sha256": "cefea22e3d2c45c3d969f3d5748de3be75bd0542a30a268c5e9b559944be25a1" + }, + "long_term_cps_2078": { + "path": "long_term/2078.h5", + "sha256": "f27118876d14cf91d42333b677735181a5457daab7cec908af291823b4ad4afb", + "metadata_sha256": "dd896194fb14fec65d6eff685e3a5526b37ee928264d6fb833e60bdba5cbccbe" + }, + "long_term_cps_2079": { + "path": "long_term/2079.h5", + "sha256": "1a285d77d1cdb4f8f6c1b13e974c8ee59b9f34425d58039d31cdcee2a7553171", + "metadata_sha256": "d07ea75362ee1946179a44e4ec266c761c014f11f43bdf1040f21cc50707eb89" + }, + "long_term_cps_2080": { + "path": "long_term/2080.h5", + "sha256": "f636b120eb8b20d2f72dd53a1dec2caa4bc07140963d4ae7e78e3b799351ebec", + "metadata_sha256": "fd84b48ea71b16d0b26b3d081109cdb19f1ee88636f64cff058e42aa08e0d4d7" + }, + "long_term_cps_2081": { + "path": "long_term/2081.h5", + "sha256": "fc79ab3836dc9c87bc7b84f77952fc1ad723912e7ea913d0fb163281ff220a4c", + "metadata_sha256": "2743ea548078bb4fb44ba2db4bbde5a3972a0b3d3c011862c3812f9f575e2a3d" + }, + "long_term_cps_2082": { + "path": "long_term/2082.h5", + "sha256": "0065fa242ca2e46bbaedef9a0b429a7acfb63ad1a908618ab589e52281cd3328", + "metadata_sha256": "01d08481929e0799f79a9aee4a340132a923f7b03cfad317eef471a723ffaa42" + }, + "long_term_cps_2083": { + "path": "long_term/2083.h5", + "sha256": "b4f4226d601c8b28ecab812d366feeb7c3c8d65cc29e80f2ce8db8752ab2a1ec", + "metadata_sha256": "e2a4dd53aa85fd54cab04cc4b9697f285e7541e9ee181c3bd2b71db91462ec85" + }, + "long_term_cps_2084": { + "path": "long_term/2084.h5", + "sha256": "a912b6bb87b5c87b0346233e3d2e99e5c1249790ae8cf7f5fa0843ee3a46a54e", + "metadata_sha256": "9d53595304292e5a1a33c55c0f048476125b7301f1a6f0251d1d768471a913df" + }, + "long_term_cps_2085": { + "path": "long_term/2085.h5", + "sha256": "c8095581dca95c02ccf8bb651e1850673f231607b6590795143d741f29e122fe", + "metadata_sha256": "62c58ff7b9beca1ab45b5abcd8c9d9dcc50491a51093385b5bf298e3c4c71621" + }, + "long_term_cps_2086": { + "path": "long_term/2086.h5", + "sha256": "eb0fdb5493bfaec2c87a2d6698ece62f84cb20377e2d37846de078c8cec4065f", + "metadata_sha256": "a821f1e5b438fc04a3226384855b68d8b60fd94513493611c5b6450ec6cf5934" + }, + "long_term_cps_2087": { + "path": "long_term/2087.h5", + "sha256": "02fefc2e524364d5f6c6aa8ed08f2b91256eaee41dfb7318398d97d969016d46", + "metadata_sha256": "36282ca77ac0beb99f4a0211c373ffa6d96c34a8ebfa7581406954bb88e8bbb3" + }, + "long_term_cps_2088": { + "path": "long_term/2088.h5", + "sha256": "d880dc4bed83d60c999f936ad7613ba48ac9a6a9c2f93307b4b3465f0d18b276", + "metadata_sha256": "c2b77f65a3ee5b10a77da803500b14e7fd70823402e5e265141bdf43953332f6" + }, + "long_term_cps_2089": { + "path": "long_term/2089.h5", + "sha256": "b706332ea23919947767be6c5a40a68227624ec56751564439ed09b6140dc424", + "metadata_sha256": "a7dbc17d23ecde9371295b0b68b4e60ee37ab294519fe3e77593c5fabc27b79f" + }, + "long_term_cps_2090": { + "path": "long_term/2090.h5", + "sha256": "1c1575b79c8d6fd8c1e16d95538722d9f7159db04d0959c624bea70481c60aff", + "metadata_sha256": "5f57ef16b2a5869b0d8400fd66d99b7bd95957fb7dac0c139c58bcb30446ca1a" + }, + "long_term_cps_2091": { + "path": "long_term/2091.h5", + "sha256": "f97d591ac6e09cb6f17f10c6a9c3debf6386986425858f8855e4e8674df81337", + "metadata_sha256": "cd42df35fae5d567efca62720c759fedbaf32f1b39b5ec5eda5f25834c021695" + }, + "long_term_cps_2092": { + "path": "long_term/2092.h5", + "sha256": "8905054c7c8b7950d11613f42c4f5c04f96055cbb0944f5ecfb5d6e1a3bf33c1", + "metadata_sha256": "55fe9145eee21aeacc7c029de1946fc1afebd8e3e6a8ebb57b32037aa641f72d" + }, + "long_term_cps_2093": { + "path": "long_term/2093.h5", + "sha256": "693cca62d9b3c000f3b0740e63dc59b818d11d17d1e682f6378059a50f2f62e5", + "metadata_sha256": "7180ee6362da862551355dac6ca1f4a702227352bf63974a714e64302510b781" + }, + "long_term_cps_2094": { + "path": "long_term/2094.h5", + "sha256": "14a36c50245f2487bd2be8560b8b5224eae49a881c58df032f31b905eaf95cfc", + "metadata_sha256": "2d804f1ff8be3b18c7cf9ea5fde5c53f6c0c30b98e9c0c115068656aa54db9a6" + }, + "long_term_cps_2095": { + "path": "long_term/2095.h5", + "sha256": "81a2afd5579db9c00a65baafe346897fbb2f5354342c1c5585a2191d496b93ba", + "metadata_sha256": "bb37a5d5baf22afed81282dc1ab89f58afb5af8773f4c5c4572f101b780868c6" + }, + "long_term_cps_2096": { + "path": "long_term/2096.h5", + "sha256": "49e16bda94832ed824e58c96f45e678c14e30165b48134e5544d7b0884feede3", + "metadata_sha256": "ec95487601e4e40d57ff906d666fafafaf4492db2fd7891601ff92fbfd0eecbb" + }, + "long_term_cps_2097": { + "path": "long_term/2097.h5", + "sha256": "2f9fcb011aa2348d0e45738031f6f26752f02d3589bde88936d69f437e0b34aa", + "metadata_sha256": "03614ebe2d90b99f669acda4dff641f16ae669e5ff4de7419cf156251ed6e077" + }, + "long_term_cps_2098": { + "path": "long_term/2098.h5", + "sha256": "3c9061f845c417e67c2f747ed196b0ed30f04bd1a7354d50cd31f233b69c1e8e", + "metadata_sha256": "809a36b058a0f9359ab751279eaa056a26c1c31732b98a2ff60656d51493b201" + }, + "long_term_cps_2099": { + "path": "long_term/2099.h5", + "sha256": "aea59ce5cb3708f4cf44e9c4db8fd8d694d9740e3118d7de94dfb3bd4186cb37", + "metadata_sha256": "92336e7424570861ed5ddd207033f8269810343a306371182671dc34fdeb3439" + }, + "long_term_cps_2100": { + "path": "long_term/2100.h5", + "sha256": "e9012cd744527980a27691e815ce68836c97ef22c65a71170b48a3ea89bfc1b1", + "metadata_sha256": "c43c0e4fc095a4e467c14b5b965c9303a2339220133ac19bcfc45df928365116" } }, "region_datasets": { diff --git a/src/policyengine/data/release_manifests/us.trace.tro.jsonld b/src/policyengine/data/release_manifests/us.trace.tro.jsonld index 47c67c74..1810b963 100644 --- a/src/policyengine/data/release_manifests/us.trace.tro.jsonld +++ b/src/policyengine/data/release_manifests/us.trace.tro.jsonld @@ -17,7 +17,6 @@ "schema:name": "PolicyEngine", "schema:url": "https://policyengine.org" }, - "schema:dateCreated": "2026-05-16T04:38:05.331368Z", "schema:description": "TRACE TRO for certified runtime bundle us-4.5.1 covering the bundle manifest, the certified dataset artifact, the country model wheel, and the country data release manifest when it is available.", "schema:name": "policyengine us certified bundle TRO", "trov:createdWith": { @@ -39,14 +38,6 @@ }, "trov:hasLocation": "data/release_manifests/us.json" }, - { - "@id": "arrangement/1/location/data_release_manifest", - "@type": "trov:ArtifactLocation", - "trov:hasArtifact": { - "@id": "composition/1/artifact/data_release_manifest" - }, - "trov:hasLocation": "https://huggingface.co/policyengine/policyengine-us-data/resolve/69fc39a7fece4c49ba87291e598e76b40568cc5d/releases/1.115.3/release_manifest.json" - }, { "@id": "arrangement/1/location/dataset", "@type": "trov:ArtifactLocation", @@ -61,7 +52,7 @@ "trov:hasArtifact": { "@id": "composition/1/artifact/model_wheel" }, - "trov:hasLocation": "https://files.pythonhosted.org/packages/a7/a3/53630e30285cb441cc96e5264621b1b9e5af3b648ccd176b376f0e03a458/policyengine_us-1.691.10-py3-none-any.whl" + "trov:hasLocation": "https://files.pythonhosted.org/packages/83/6f/b605fc1d8e06e377ae50870dc44a28cfe6562f0032e36dd53a5ef49472db/policyengine_us-1.691.12-py3-none-any.whl" } ] } @@ -75,54 +66,43 @@ "@type": "trov:ResearchArtifact", "schema:name": "policyengine.py bundle manifest for us", "trov:mimeType": "application/json", - "trov:sha256": "55cd4433702834bacd1057d6c728ff5730f35e508f78821040be6c48dd304045" - }, - { - "@id": "composition/1/artifact/data_release_manifest", - "@type": "trov:ResearchArtifact", - "schema:name": "policyengine-us-data release manifest 1.115.3", - "trov:mimeType": "application/json", - "trov:sha256": "50fec0baf199cf7f5bc5a16d9956d7c8d0fca3e2097b69dbb7f26cbbcf9114a9" + "trov:sha256": "3229be76641f8c8d79b44604da04453cf8aa1b28169a4816a3ef818f6f93bfee" }, { "@id": "composition/1/artifact/dataset", "@type": "trov:ResearchArtifact", "schema:name": "enhanced_cps_2024", - "trov:mimeType": "application/x-hdf5", "trov:sha256": "96965b0d9931d36beb29486b25fffee8b630e3b62c9376e7265976e02f7ab3ca" }, { "@id": "composition/1/artifact/model_wheel", "@type": "trov:ResearchArtifact", - "schema:name": "policyengine-us==1.691.10 wheel", + "schema:name": "policyengine-us==1.691.12 wheel", "trov:mimeType": "application/zip", - "trov:sha256": "a2c8365f152c2ec3332e996c4003d560b433b6c204bdfcc73bf094a0ae49f10e" + "trov:sha256": "ef43482bd8c6cc16f8f1d4050423f5dc1d045af15931f5d1b089715a31c839d2" } ], "trov:hasFingerprint": { "@id": "composition/1/fingerprint", "@type": "trov:CompositionFingerprint", - "trov:sha256": "8fc5a40f13183c7c041b3dc6e76e92aa14ee959f300f39f434cdaca0f95b1617" + "trov:sha256": "c14a4009d824044cd386f08b93b31f6af4cc608619ee22d83314f481fddfd47b" } }, "trov:hasPerformance": { "@id": "trp/1", "@type": "trov:TransparentResearchPerformance", - "pe:builtWithModelVersion": "1.691.10", - "pe:certifiedBy": "policyengine.py bundled manifest", - "pe:certifiedForModelVersion": "1.691.10", - "pe:ciGitRef": "refs/heads/main", - "pe:ciGitSha": "4c052fa345868f1eb92757ee779a653a82c72b71", - "pe:ciRunUrl": "https://github.com/PolicyEngine/policyengine.py/actions/runs/25953758405", - "pe:compatibilityBasis": "exact_build_model_version", - "pe:dataBuildFingerprint": "sha256:b80625974285a9cff23bc2436fc46df98477997d50aea61e73b7f137b5121584", - "pe:dataBuildId": "policyengine-us-data-1.115.3", - "pe:emittedIn": "github-actions", - "rdfs:comment": "Certification of build policyengine-us-data-1.115.3 for policyengine-us 1.691.10.", + "pe:builtWithModelVersion": "1.691.12", + "pe:certifiedBy": "policyengine.py candidate long-term manifest", + "pe:certifiedForModelVersion": "1.691.12", + "pe:compatibilityBasis": "candidate_long_term_bundle", + "pe:dataBuildFingerprint": "sha256:ae675b873404ec1fdc461e056a02958816fd0f8ab08ba5883fb05c4cf40f5b49", + "pe:dataBuildId": "policyengine-us-data-crfb-longrun-20260517", + "pe:dataReleaseManifestStatus": "unavailable", + "pe:emittedIn": "local", + "rdfs:comment": "Certification of build policyengine-us-data-crfb-longrun-20260517 for policyengine-us 1.691.12.", "trov:accessedArrangement": { "@id": "arrangement/1" }, - "trov:startedAtTime": "2026-05-16T04:38:05.331368Z", "trov:wasConductedBy": { "@id": "trs" } diff --git a/src/policyengine/provenance/manifest.py b/src/policyengine/provenance/manifest.py index c3f2b48f..9332cc92 100644 --- a/src/policyengine/provenance/manifest.py +++ b/src/policyengine/provenance/manifest.py @@ -57,6 +57,8 @@ class DataBuildInfo(BaseModel): class ArtifactPathReference(BaseModel): path: str + sha256: Optional[str] = None + metadata_sha256: Optional[str] = None class ArtifactPathTemplate(BaseModel): @@ -502,6 +504,28 @@ def resolve_local_managed_dataset_source( _, _, path_in_repo = parts model_module_name, data_repo_name, data_package_name = local_hint + explicit_repo_roots = [] + country_env = f"POLICYENGINE_{country_id.upper()}_DATA_REPO" + for env_name in (country_env, "POLICYENGINE_LOCAL_DATA_REPO_ROOT"): + env_value = os.environ.get(env_name) + if env_value: + explicit_repo_roots.extend( + [ + Path(env_value).expanduser(), + Path(env_value).expanduser() / data_repo_name, + ] + ) + + for candidate_repo_root in explicit_repo_roots: + local_path = ( + candidate_repo_root + / data_package_name + / "storage" + / path_in_repo + ) + if local_path.exists(): + return str(local_path) + try: model_module = import_module(model_module_name) except ImportError: diff --git a/src/policyengine/tax_benefit_models/us/__init__.py b/src/policyengine/tax_benefit_models/us/__init__.py index 3bc605bd..1f901b06 100644 --- a/src/policyengine/tax_benefit_models/us/__init__.py +++ b/src/policyengine/tax_benefit_models/us/__init__.py @@ -38,6 +38,7 @@ ensure_datasets, load_datasets, load_long_term_datasets, + load_managed_long_term_datasets, validate_long_term_dataset_metadata, ) from .household import calculate_household @@ -63,6 +64,7 @@ "create_datasets", "load_datasets", "load_long_term_datasets", + "load_managed_long_term_datasets", "ensure_datasets", "validate_long_term_dataset_metadata", "PolicyEngineUS", diff --git a/src/policyengine/tax_benefit_models/us/datasets.py b/src/policyengine/tax_benefit_models/us/datasets.py index 6ba01b6f..38a04f9e 100644 --- a/src/policyengine/tax_benefit_models/us/datasets.py +++ b/src/policyengine/tax_benefit_models/us/datasets.py @@ -1,3 +1,5 @@ +import hashlib +import importlib.util import json import warnings from importlib import metadata as importlib_metadata @@ -12,7 +14,10 @@ from policyengine.core import Dataset, YearData from policyengine.provenance.manifest import ( dataset_logical_name, + get_release_manifest, resolve_dataset_reference, + resolve_local_managed_dataset_source, + resolve_managed_dataset_reference, ) @@ -553,9 +558,39 @@ def _runtime_policyengine_us_metadata() -> dict[str, Any]: result["direct_url"] = json.loads(direct_url_text) except json.JSONDecodeError: result["direct_url"] = {} + package_file = _runtime_policyengine_us_package_file() + if package_file is not None: + result["package_file_sha256"] = _sha256_file(package_file) + result["package_tree_sha256"] = _sha256_directory(package_file.parent) return result +def _runtime_policyengine_us_package_file() -> Optional[Path]: + spec = importlib.util.find_spec("policyengine_us") + if spec is None or spec.origin is None: + return None + path = Path(spec.origin) + return path if path.exists() else None + + +def _sha256_directory(path: Path) -> str: + digest = hashlib.sha256() + for file_path in sorted(path.rglob("*")): + if not file_path.is_file(): + continue + if "__pycache__" in file_path.parts or file_path.suffix in {".pyc", ".pyo"}: + continue + relative_path = file_path.relative_to(path).as_posix() + contents = file_path.read_bytes() + digest.update(relative_path.encode("utf-8")) + digest.update(b"\0") + digest.update(str(len(contents)).encode("utf-8")) + digest.update(b"\0") + digest.update(contents) + digest.update(b"\0") + return digest.hexdigest() + + def _validate_runtime_policyengine_us_match( metadata: dict, *, @@ -581,6 +616,15 @@ def _validate_runtime_policyengine_us_match( f"{metadata_git_sha!r}, but the installed runtime has " f"{runtime_git_sha!r}." ) + for label in ("package_tree_sha256", "package_file_sha256"): + metadata_hash = policyengine_us.get(label) + runtime_hash = runtime_policyengine_us.get(label) + if metadata_hash and runtime_hash != metadata_hash: + raise ValueError( + f"Long-term dataset {path} was built with policyengine-us " + f"{label} {metadata_hash!r}, but the installed runtime has " + f"{runtime_hash!r}." + ) def validate_long_term_dataset_metadata( @@ -737,6 +781,104 @@ def validate_long_term_dataset_metadata( _validate_runtime_policyengine_us_match(metadata, path=path) +def _long_term_dataset_key(dataset_name: str, year: int) -> str: + return f"{dataset_name}_{int(year)}" + + +def _build_long_term_dataset( + *, + path: Path, + year: int, + dataset_name: str, + metadata: dict, + metadata_path: Optional[Path], + dataset_uri: Optional[str] = None, +) -> PolicyEngineUSDataset: + dataset = PolicyEngineUSDataset( + id=_long_term_dataset_key(dataset_name, year), + name=f"{dataset_name}-{year}", + description=f"US long-term projected dataset for {year}", + filepath=str(path), + year=int(year), + metadata=metadata, + metadata_filepath=str(metadata_path) if metadata_path else None, + ) + if dataset_uri is not None: + dataset.metadata.setdefault("policyengine_bundle", {}) + dataset.metadata["policyengine_bundle"].update( + { + "managed_by": "policyengine.py", + "runtime_dataset": _long_term_dataset_key(dataset_name, year), + "runtime_dataset_uri": dataset_uri, + } + ) + return dataset + + +def _validate_loaded_long_term_metadata( + *, + metadata: dict, + metadata_path: Optional[Path], + path: Path, + year: int, + required_profile: Optional[str], + required_target_source: Optional[str], + required_tax_assumption: Optional[str], + required_support_augmentation_profile: Optional[str], + required_support_augmentation_target_year: Optional[int], + required_support_augmentation_target_year_strategy: Optional[str], + required_support_augmentation_blueprint_base_weight_scale: Optional[float], + require_support_augmentation_sanitize_clone_non_target_income: Optional[bool], + require_support_augmentation_sanitize_worker_non_target_income: Optional[bool], + minimum_calibration_quality: Optional[str], + require_validation_passed: bool, + required_policyengine_us_version: Optional[str], + required_policyengine_us_git_sha: Optional[str], + require_policyengine_us_clean_build: bool, + require_runtime_policyengine_us_match: bool, +) -> None: + if metadata_path is None: + return + validate_long_term_dataset_metadata( + metadata, + path=path, + year=year, + required_profile=required_profile, + required_target_source=required_target_source, + required_tax_assumption=required_tax_assumption, + required_support_augmentation_profile=required_support_augmentation_profile, + required_support_augmentation_target_year=( + required_support_augmentation_target_year + ), + required_support_augmentation_target_year_strategy=( + required_support_augmentation_target_year_strategy + ), + required_support_augmentation_blueprint_base_weight_scale=( + required_support_augmentation_blueprint_base_weight_scale + ), + require_support_augmentation_sanitize_clone_non_target_income=( + require_support_augmentation_sanitize_clone_non_target_income + ), + require_support_augmentation_sanitize_worker_non_target_income=( + require_support_augmentation_sanitize_worker_non_target_income + ), + minimum_calibration_quality=minimum_calibration_quality, + require_validation_passed=require_validation_passed, + required_policyengine_us_version=required_policyengine_us_version, + required_policyengine_us_git_sha=required_policyengine_us_git_sha, + require_policyengine_us_clean_build=require_policyengine_us_clean_build, + require_runtime_policyengine_us_match=require_runtime_policyengine_us_match, + ) + + +def _sha256_file(path: Path) -> str: + digest = hashlib.sha256() + with path.open("rb") as file: + for chunk in iter(lambda: file.read(1024 * 1024), b""): + digest.update(chunk) + return digest.hexdigest() + + def load_long_term_datasets( years: list[int], data_folder: str = "./projected_datasets", @@ -779,54 +921,178 @@ def load_long_term_datasets( raise FileNotFoundError(f"Long-term dataset not found: {path}") metadata, metadata_path = _load_dataset_metadata(path, require_metadata) - if metadata_path is not None: - validate_long_term_dataset_metadata( - metadata, - path=path, - year=year, - required_profile=required_profile, - required_target_source=required_target_source, - required_tax_assumption=required_tax_assumption, - required_support_augmentation_profile=( - required_support_augmentation_profile - ), - required_support_augmentation_target_year=( - required_support_augmentation_target_year - ), - required_support_augmentation_target_year_strategy=( - required_support_augmentation_target_year_strategy - ), - required_support_augmentation_blueprint_base_weight_scale=( - required_support_augmentation_blueprint_base_weight_scale - ), - require_support_augmentation_sanitize_clone_non_target_income=( - require_support_augmentation_sanitize_clone_non_target_income - ), - require_support_augmentation_sanitize_worker_non_target_income=( - require_support_augmentation_sanitize_worker_non_target_income - ), - minimum_calibration_quality=minimum_calibration_quality, - require_validation_passed=require_validation_passed, - required_policyengine_us_version=required_policyengine_us_version, - required_policyengine_us_git_sha=required_policyengine_us_git_sha, - require_policyengine_us_clean_build=( - require_policyengine_us_clean_build - ), - require_runtime_policyengine_us_match=( - require_runtime_policyengine_us_match - ), + _validate_loaded_long_term_metadata( + metadata=metadata, + metadata_path=metadata_path, + path=path, + year=year, + required_profile=required_profile, + required_target_source=required_target_source, + required_tax_assumption=required_tax_assumption, + required_support_augmentation_profile=( + required_support_augmentation_profile + ), + required_support_augmentation_target_year=( + required_support_augmentation_target_year + ), + required_support_augmentation_target_year_strategy=( + required_support_augmentation_target_year_strategy + ), + required_support_augmentation_blueprint_base_weight_scale=( + required_support_augmentation_blueprint_base_weight_scale + ), + require_support_augmentation_sanitize_clone_non_target_income=( + require_support_augmentation_sanitize_clone_non_target_income + ), + require_support_augmentation_sanitize_worker_non_target_income=( + require_support_augmentation_sanitize_worker_non_target_income + ), + minimum_calibration_quality=minimum_calibration_quality, + require_validation_passed=require_validation_passed, + required_policyengine_us_version=required_policyengine_us_version, + required_policyengine_us_git_sha=required_policyengine_us_git_sha, + require_policyengine_us_clean_build=require_policyengine_us_clean_build, + require_runtime_policyengine_us_match=require_runtime_policyengine_us_match, + ) + + dataset = _build_long_term_dataset( + path=path, + year=year, + dataset_name=dataset_name, + metadata=metadata, + metadata_path=metadata_path, + ) + result[_long_term_dataset_key(dataset_name, year)] = dataset + + return result + + +def load_managed_long_term_datasets( + years: list[int], + dataset_name: str = "long_term_cps", + require_metadata: bool = True, + required_profile: Optional[str] = None, + required_target_source: Optional[str] = None, + required_tax_assumption: Optional[str] = None, + required_support_augmentation_profile: Optional[str] = None, + required_support_augmentation_target_year: Optional[int] = None, + required_support_augmentation_target_year_strategy: Optional[str] = None, + required_support_augmentation_blueprint_base_weight_scale: Optional[float] = None, + require_support_augmentation_sanitize_clone_non_target_income: Optional[ + bool + ] = None, + require_support_augmentation_sanitize_worker_non_target_income: Optional[ + bool + ] = None, + minimum_calibration_quality: Optional[str] = None, + require_validation_passed: bool = False, + required_policyengine_us_version: Optional[str] = None, + required_policyengine_us_git_sha: Optional[str] = None, + require_policyengine_us_clean_build: bool = False, + require_runtime_policyengine_us_match: bool = True, +) -> dict[str, PolicyEngineUSDataset]: + """Load bundled long-term US datasets from the managed release manifest. + + Each requested year must have a logical dataset entry named + ``{dataset_name}_{year}`` in the bundled US manifest. For local development, + policyengine.py first checks for the corresponding sibling data-repo mirror + before falling back to the managed URI. Long-term H5 files are large, so this + helper intentionally refuses to stream remote files directly; callers should + either provide the published local mirror or use ``load_long_term_datasets`` + with an explicit local data folder. + """ + + manifest = get_release_manifest("us") + if required_policyengine_us_version is None: + required_policyengine_us_version = manifest.model_package.version + + result = {} + for year in years: + key = _long_term_dataset_key(dataset_name, year) + path_reference = manifest.datasets.get(key) + if path_reference is None: + raise ValueError( + f"Managed long-term dataset {key!r} is not present in the " + "bundled US release manifest." + ) + if not path_reference.sha256: + raise ValueError( + f"Managed long-term dataset {key!r} is missing a sha256 in " + "the bundled US release manifest." + ) + dataset_uri = resolve_managed_dataset_reference("us", key) + dataset_source = resolve_local_managed_dataset_source("us", dataset_uri) + if "://" in dataset_source: + raise FileNotFoundError( + f"Managed long-term dataset {key!r} resolves to {dataset_uri}, " + "but no local mirror exists. Download the bundled artifact into " + "the sibling policyengine-us-data storage mirror or call " + "load_long_term_datasets(..., data_folder=...) with an explicit " + "local directory." ) - dataset = PolicyEngineUSDataset( - id=f"{dataset_name}_{year}", - name=f"{dataset_name}-{year}", - description=f"US long-term projected dataset for {year}", - filepath=str(path), - year=int(year), + path = Path(dataset_source).expanduser() + actual_sha256 = _sha256_file(path) + if actual_sha256 != path_reference.sha256: + raise ValueError( + f"Managed long-term dataset {key!r} at {path} has sha256 " + f"{actual_sha256}, expected {path_reference.sha256}." + ) + metadata, metadata_path = _load_dataset_metadata(path, require_metadata) + if path_reference.metadata_sha256: + if metadata_path is None: + raise FileNotFoundError( + f"Managed long-term dataset {key!r} at {path} is missing " + "metadata sidecar required by the bundled manifest." + ) + metadata_sha256 = _sha256_file(metadata_path) + if metadata_sha256 != path_reference.metadata_sha256: + raise ValueError( + f"Managed long-term dataset {key!r} metadata at " + f"{metadata_path} has sha256 {metadata_sha256}, expected " + f"{path_reference.metadata_sha256}." + ) + _validate_loaded_long_term_metadata( + metadata=metadata, + metadata_path=metadata_path, + path=path, + year=year, + required_profile=required_profile, + required_target_source=required_target_source, + required_tax_assumption=required_tax_assumption, + required_support_augmentation_profile=( + required_support_augmentation_profile + ), + required_support_augmentation_target_year=( + required_support_augmentation_target_year + ), + required_support_augmentation_target_year_strategy=( + required_support_augmentation_target_year_strategy + ), + required_support_augmentation_blueprint_base_weight_scale=( + required_support_augmentation_blueprint_base_weight_scale + ), + require_support_augmentation_sanitize_clone_non_target_income=( + require_support_augmentation_sanitize_clone_non_target_income + ), + require_support_augmentation_sanitize_worker_non_target_income=( + require_support_augmentation_sanitize_worker_non_target_income + ), + minimum_calibration_quality=minimum_calibration_quality, + require_validation_passed=require_validation_passed, + required_policyengine_us_version=required_policyengine_us_version, + required_policyengine_us_git_sha=required_policyengine_us_git_sha, + require_policyengine_us_clean_build=require_policyengine_us_clean_build, + require_runtime_policyengine_us_match=require_runtime_policyengine_us_match, + ) + result[key] = _build_long_term_dataset( + path=path, + year=year, + dataset_name=dataset_name, metadata=metadata, - metadata_filepath=str(metadata_path) if metadata_path else None, + metadata_path=metadata_path, + dataset_uri=dataset_uri, ) - result[f"{dataset_name}_{year}"] = dataset return result diff --git a/tests/test_models.py b/tests/test_models.py index 68d08d15..6454d14b 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -113,7 +113,7 @@ def test_has_release_manifest_metadata(self): assert us_latest.release_manifest is not None assert us_latest.release_manifest.country_id == "us" assert us_latest.model_package.name == "policyengine-us" - assert us_latest.model_package.version == "1.691.10" + assert us_latest.model_package.version == "1.691.12" assert us_latest.data_package.name == "policyengine-us-data" assert us_latest.data_package.version == "1.115.3" assert ( diff --git a/tests/test_release_manifests.py b/tests/test_release_manifests.py index 7264c0d7..a47e8e93 100644 --- a/tests/test_release_manifests.py +++ b/tests/test_release_manifests.py @@ -22,6 +22,7 @@ get_release_manifest, https_release_manifest_uri, resolve_dataset_reference, + resolve_local_managed_dataset_source, resolve_managed_dataset_reference, ) from policyengine.tax_benefit_models.uk import ( @@ -38,6 +39,18 @@ PYPROJECT.read_text(), re.MULTILINE, ).group(1) +US_MODEL_VERSION = "1.691.12" +US_DATA_RELEASE_PATH = "releases/crfb-longrun-20260517/release_manifest.json" +US_DATA_RELEASE_REVISION = "crfb-longrun-20260517" +US_CERTIFICATION_SOURCE = "policyengine.py candidate long-term manifest" +US_DEFAULT_DATASET_URI = ( + "hf://policyengine/policyengine-us-data/" + "enhanced_cps_2024.h5@69fc39a7fece4c49ba87291e598e76b40568cc5d" +) +US_ENHANCED_CPS_MANAGED_URI = ( + "hf://policyengine/policyengine-us-data/" + f"enhanced_cps_2024.h5@{US_DATA_RELEASE_REVISION}" +) def _response_with_json(payload: dict) -> MagicMock: @@ -64,27 +77,24 @@ def test__given_us_manifest__then_has_pinned_model_and_data_packages(self): assert manifest.country_id == "us" assert manifest.policyengine_version == POLICYENGINE_VERSION assert manifest.model_package.name == "policyengine-us" - assert manifest.model_package.version == "1.691.10" + assert manifest.model_package.version == US_MODEL_VERSION assert manifest.data_package.name == "policyengine-us-data" assert manifest.data_package.version == "1.115.3" assert manifest.data_package.repo_id == "policyengine/policyengine-us-data" - assert ( - manifest.data_package.release_manifest_path - == "releases/1.115.3/release_manifest.json" - ) - assert ( - manifest.data_package.release_manifest_revision - == "69fc39a7fece4c49ba87291e598e76b40568cc5d" - ) + assert manifest.data_package.release_manifest_path == US_DATA_RELEASE_PATH + assert manifest.data_package.release_manifest_revision == US_DATA_RELEASE_REVISION assert manifest.certified_data_artifact is not None assert ( manifest.certified_data_artifact.build_id == "policyengine-us-data-1.115.3" ) assert manifest.certified_data_artifact.dataset == "enhanced_cps_2024" assert manifest.certification is not None - assert manifest.certification.data_build_id == "policyengine-us-data-1.115.3" - assert manifest.certification.built_with_model_version == "1.691.10" - assert manifest.certification.certified_for_model_version == "1.691.10" + assert ( + manifest.certification.data_build_id + == "policyengine-us-data-crfb-longrun-20260517" + ) + assert manifest.certification.built_with_model_version == US_MODEL_VERSION + assert manifest.certification.certified_for_model_version == US_MODEL_VERSION def test__given_uk_manifest__then_has_pinned_model_and_data_packages(self): manifest = get_release_manifest("uk") @@ -117,10 +127,7 @@ def test__given_uk_manifest__then_has_pinned_model_and_data_packages(self): def test__given_us_dataset_name__then_resolves_to_versioned_hf_url(self): resolved = resolve_dataset_reference("us", "enhanced_cps_2024") - assert ( - resolved - == "hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5@69fc39a7fece4c49ba87291e598e76b40568cc5d" - ) + assert resolved == US_ENHANCED_CPS_MANAGED_URI def test__given_uk_dataset_name__then_resolves_to_versioned_hf_url(self): resolved = resolve_dataset_reference("uk", "enhanced_frs_2023_24") @@ -171,6 +178,28 @@ def test__given_versioned_dataset_url__then_logical_name_drops_version(self): assert dataset_logical_name(dataset) == "enhanced_cps_2024" + def test__given_explicit_local_data_repo__then_resolves_local_mirror( + self, monkeypatch, tmp_path + ): + local_dataset = ( + tmp_path + / "policyengine-us-data" + / "policyengine_us_data" + / "storage" + / "long_term" + / "2100.h5" + ) + local_dataset.parent.mkdir(parents=True) + local_dataset.write_text("", encoding="utf-8") + monkeypatch.setenv("POLICYENGINE_LOCAL_DATA_REPO_ROOT", str(tmp_path)) + + resolved = resolve_local_managed_dataset_source( + "us", + "hf://policyengine/policyengine-us-data/long_term/2100.h5@candidate", + ) + + assert resolved == str(local_dataset) + def test__given_country__then_can_fetch_data_release_manifest(self): get_data_release_manifest.cache_clear() payload = { @@ -240,8 +269,7 @@ def test__given_country__then_can_fetch_data_release_manifest(self): mock_get.assert_called_once() assert mock_get.call_args.args[0] == ( "https://huggingface.co/policyengine/policyengine-us-data/resolve/" - "69fc39a7fece4c49ba87291e598e76b40568cc5d/" - "releases/1.115.3/release_manifest.json" + f"{US_DATA_RELEASE_REVISION}/{US_DATA_RELEASE_PATH}" ) def test__given_explicit_manifest_revision__then_builds_manifest_url(self): @@ -249,8 +277,7 @@ def test__given_explicit_manifest_revision__then_builds_manifest_url(self): assert https_release_manifest_uri(manifest.data_package) == ( "https://huggingface.co/policyengine/policyengine-us-data/resolve/" - "69fc39a7fece4c49ba87291e598e76b40568cc5d/" - "releases/1.115.3/release_manifest.json" + f"{US_DATA_RELEASE_REVISION}/{US_DATA_RELEASE_PATH}" ) def test__given_release_manifest_artifact_uses_version_tag__then_rewrites_to_commit( @@ -281,10 +308,7 @@ def test__given_release_manifest_artifact_uses_version_tag__then_rewrites_to_com ): manifest = get_data_release_manifest("us") - assert ( - manifest.artifacts["enhanced_cps_2024"].uri - == "hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5@69fc39a7fece4c49ba87291e598e76b40568cc5d" - ) + assert manifest.artifacts["enhanced_cps_2024"].uri == US_ENHANCED_CPS_MANAGED_URI assert ( manifest.source_sha256 == hashlib.sha256(json.dumps(payload).encode("utf-8")).hexdigest() @@ -394,7 +418,7 @@ def test__given_private_manifest_unavailable__then_bundled_certification_is_used ): certification = certify_data_release_compatibility( "us", - runtime_model_version="1.691.10", + runtime_model_version=US_MODEL_VERSION, ) assert certification == get_release_manifest("us").certification @@ -410,7 +434,7 @@ def test__given_manifest_request_timeout__then_bundled_certification_is_used( ): certification = certify_data_release_compatibility( "us", - runtime_model_version="1.691.10", + runtime_model_version=US_MODEL_VERSION, ) assert certification == get_release_manifest("us").certification @@ -508,7 +532,7 @@ def test__given_offline_hf__then_us_import_uses_bundled_certification( ) assert result.returncode == 0, result.stderr - assert "policyengine.py bundled manifest" in result.stdout + assert US_CERTIFICATION_SOURCE in result.stdout def test__given_mismatched_version_and_fingerprint__then_certification_fails(self): get_data_release_manifest.cache_clear() diff --git a/tests/test_us_long_term_datasets.py b/tests/test_us_long_term_datasets.py index 0990287f..7e0d43f7 100644 --- a/tests/test_us_long_term_datasets.py +++ b/tests/test_us_long_term_datasets.py @@ -1,5 +1,7 @@ +import hashlib import json from pathlib import Path +from types import SimpleNamespace import h5py import pandas as pd @@ -11,6 +13,7 @@ PolicyEngineUSDataset, USYearData, load_long_term_datasets, + load_managed_long_term_datasets, ) @@ -123,6 +126,26 @@ def _write_metadata(path: Path, year: int, **overrides) -> None: ) +def _sha256(path: Path) -> str: + return hashlib.sha256(path.read_bytes()).hexdigest() + + +def _manifest_with_long_term_sha( + sha256: str, + version: str = "1.691.12", + metadata_sha256=None, +): + return SimpleNamespace( + model_package=SimpleNamespace(version=version), + datasets={ + "long_term_cps_2100": SimpleNamespace( + sha256=sha256, + metadata_sha256=metadata_sha256, + ) + }, + ) + + def test__load_long_term_datasets__loads_h5_and_sidecar_metadata(tmp_path): h5_path = tmp_path / "2075.h5" _write_us_h5(h5_path, 2075) @@ -230,6 +253,167 @@ def test__load_long_term_datasets__rejects_support_contract_mismatch(tmp_path): ) +def test__load_managed_long_term_datasets__loads_bundled_local_mirror( + monkeypatch, + tmp_path, +): + h5_path = tmp_path / "2100.h5" + _write_us_h5(h5_path, 2100) + _write_metadata( + h5_path, + 2100, + policyengine_us={"version": "1.691.12"}, + ) + dataset_uri = "hf://policyengine/policyengine-us-data/long_term/2100.h5@abc123" + + monkeypatch.setattr( + us_datasets_module, + "get_release_manifest", + lambda country_id: _manifest_with_long_term_sha(_sha256(h5_path)), + ) + monkeypatch.setattr( + us_datasets_module, + "resolve_managed_dataset_reference", + lambda country_id, dataset: dataset_uri, + ) + monkeypatch.setattr( + us_datasets_module, + "resolve_local_managed_dataset_source", + lambda country_id, uri: str(h5_path), + ) + + datasets = load_managed_long_term_datasets( + [2100], + required_profile="ss-payroll-tob", + required_target_source="trustees_2025_current_law", + required_tax_assumption="trustees-core-thresholds-v1", + minimum_calibration_quality="exact", + require_validation_passed=True, + ) + + dataset = datasets["long_term_cps_2100"] + assert dataset.filepath == str(h5_path) + assert dataset.metadata["policyengine_bundle"] == { + "managed_by": "policyengine.py", + "runtime_dataset": "long_term_cps_2100", + "runtime_dataset_uri": dataset_uri, + } + + +def test__load_managed_long_term_datasets__defaults_to_manifest_model_version( + monkeypatch, + tmp_path, +): + h5_path = tmp_path / "2100.h5" + _write_us_h5(h5_path, 2100) + _write_metadata(h5_path, 2100, policyengine_us={"version": "1.691.10"}) + dataset_uri = "hf://policyengine/policyengine-us-data/long_term/2100.h5@abc123" + + monkeypatch.setattr( + us_datasets_module, + "get_release_manifest", + lambda country_id: _manifest_with_long_term_sha(_sha256(h5_path)), + ) + monkeypatch.setattr( + us_datasets_module, + "resolve_managed_dataset_reference", + lambda country_id, dataset: dataset_uri, + ) + monkeypatch.setattr( + us_datasets_module, + "resolve_local_managed_dataset_source", + lambda country_id, uri: str(h5_path), + ) + + with pytest.raises(ValueError, match="policyengine_us.version"): + load_managed_long_term_datasets([2100]) + + +def test__load_managed_long_term_datasets__checks_manifest_sha256( + monkeypatch, + tmp_path, +): + h5_path = tmp_path / "2100.h5" + _write_us_h5(h5_path, 2100) + _write_metadata(h5_path, 2100, policyengine_us={"version": "1.691.12"}) + dataset_uri = "hf://policyengine/policyengine-us-data/long_term/2100.h5@abc123" + + monkeypatch.setattr( + us_datasets_module, + "get_release_manifest", + lambda country_id: _manifest_with_long_term_sha("0" * 64), + ) + monkeypatch.setattr( + us_datasets_module, + "resolve_managed_dataset_reference", + lambda country_id, dataset: dataset_uri, + ) + monkeypatch.setattr( + us_datasets_module, + "resolve_local_managed_dataset_source", + lambda country_id, uri: str(h5_path), + ) + + with pytest.raises(ValueError, match="sha256"): + load_managed_long_term_datasets([2100]) + + +def test__load_managed_long_term_datasets__checks_metadata_sha256( + monkeypatch, + tmp_path, +): + h5_path = tmp_path / "2100.h5" + _write_us_h5(h5_path, 2100) + _write_metadata(h5_path, 2100, policyengine_us={"version": "1.691.12"}) + dataset_uri = "hf://policyengine/policyengine-us-data/long_term/2100.h5@abc123" + + monkeypatch.setattr( + us_datasets_module, + "get_release_manifest", + lambda country_id: _manifest_with_long_term_sha( + _sha256(h5_path), + metadata_sha256="0" * 64, + ), + ) + monkeypatch.setattr( + us_datasets_module, + "resolve_managed_dataset_reference", + lambda country_id, dataset: dataset_uri, + ) + monkeypatch.setattr( + us_datasets_module, + "resolve_local_managed_dataset_source", + lambda country_id, uri: str(h5_path), + ) + + with pytest.raises(ValueError, match="metadata"): + load_managed_long_term_datasets([2100]) + + +def test__load_managed_long_term_datasets__requires_local_mirror( + monkeypatch, +): + dataset_uri = "hf://policyengine/policyengine-us-data/long_term/2100.h5@abc123" + monkeypatch.setattr( + us_datasets_module, + "get_release_manifest", + lambda country_id: _manifest_with_long_term_sha("0" * 64), + ) + monkeypatch.setattr( + us_datasets_module, + "resolve_managed_dataset_reference", + lambda country_id, dataset: dataset_uri, + ) + monkeypatch.setattr( + us_datasets_module, + "resolve_local_managed_dataset_source", + lambda country_id, uri: uri, + ) + + with pytest.raises(FileNotFoundError, match="no local mirror exists"): + load_managed_long_term_datasets([2100]) + + def test__load_long_term_datasets__rejects_policyengine_us_version_mismatch( tmp_path, ): @@ -309,3 +493,34 @@ def test__load_long_term_datasets__can_require_runtime_policyengine_us_match( assert datasets["long_term_cps_2075"].metadata["policyengine_us"]["version"] == ( "1.691.10" ) + + +def test__load_long_term_datasets__can_require_runtime_policyengine_us_hash_match( + monkeypatch, + tmp_path, +): + h5_path = tmp_path / "2075.h5" + _write_us_h5(h5_path, 2075) + _write_metadata( + h5_path, + 2075, + policyengine_us={ + "version": "1.691.12", + "package_tree_sha256": "a" * 64, + }, + ) + monkeypatch.setattr( + us_datasets_module, + "_runtime_policyengine_us_metadata", + lambda: { + "version": "1.691.12", + "package_tree_sha256": "b" * 64, + }, + ) + + with pytest.raises(ValueError, match="package_tree_sha256"): + load_long_term_datasets( + [2075], + data_folder=str(tmp_path), + require_runtime_policyengine_us_match=True, + ) diff --git a/uv.lock b/uv.lock index 5dad10c5..ca3b973d 100644 --- a/uv.lock +++ b/uv.lock @@ -2411,7 +2411,7 @@ wheels = [ [[package]] name = "policyengine" -version = "4.5.0" +version = "4.5.1" source = { editable = "." } dependencies = [ { name = "h5py", version = "3.14.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, @@ -2478,13 +2478,13 @@ requires-dist = [ { name = "pandas", specifier = ">=2.0.0" }, { name = "plotly", marker = "extra == 'dev'", specifier = ">=5.0.0" }, { name = "plotly", marker = "extra == 'plotting'", specifier = ">=5.0.0" }, - { name = "policyengine-core", marker = "extra == 'dev'", specifier = ">=3.26.1" }, + { name = "policyengine-core", marker = "extra == 'dev'", specifier = "==3.26.1" }, { name = "policyengine-core", marker = "extra == 'uk'", specifier = ">=3.26.1" }, - { name = "policyengine-core", marker = "extra == 'us'", specifier = ">=3.26.1" }, + { name = "policyengine-core", marker = "extra == 'us'", specifier = "==3.26.1" }, { name = "policyengine-uk", marker = "extra == 'dev'", specifier = "==2.88.14" }, { name = "policyengine-uk", marker = "extra == 'uk'", specifier = "==2.88.14" }, - { name = "policyengine-us", marker = "extra == 'dev'", specifier = "==1.691.10" }, - { name = "policyengine-us", marker = "extra == 'us'", specifier = "==1.691.10" }, + { name = "policyengine-us", marker = "extra == 'dev'", specifier = "==1.691.12" }, + { name = "policyengine-us", marker = "extra == 'us'", specifier = "==1.691.12" }, { name = "psutil", specifier = ">=5.9.0" }, { name = "pydantic", specifier = ">=2.0.0" }, { name = "pytest", marker = "extra == 'dev'" }, @@ -2547,7 +2547,7 @@ wheels = [ [[package]] name = "policyengine-us" -version = "1.691.10" +version = "1.691.12" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "microdf-python" }, @@ -2559,9 +2559,9 @@ dependencies = [ { name = "tables", version = "3.11.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, { name = "tqdm" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/68/dc/f9b45c19c17ec7f34346feaf2aad363c859baba6cce8dfd42b779872f093/policyengine_us-1.691.10.tar.gz", hash = "sha256:f545db6f95ad49441262520613b905d5b3156fb05099712fa77908c49d86e804", size = 9507509, upload-time = "2026-05-16T03:36:23.612Z" } +sdist = { url = "https://files.pythonhosted.org/packages/4a/23/c8eb34c1c0c0e8150fba8467f1f8463f606a68c5b28fd31ad637cecd886d/policyengine_us-1.691.12.tar.gz", hash = "sha256:435fa2a8c7085f13a7d9d2ce903670f9d40ee7538a2db28a2dfae038d2bfa91a", size = 9507768, upload-time = "2026-05-16T16:54:37.905Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/a7/a3/53630e30285cb441cc96e5264621b1b9e5af3b648ccd176b376f0e03a458/policyengine_us-1.691.10-py3-none-any.whl", hash = "sha256:a2c8365f152c2ec3332e996c4003d560b433b6c204bdfcc73bf094a0ae49f10e", size = 10022315, upload-time = "2026-05-16T03:36:18.772Z" }, + { url = "https://files.pythonhosted.org/packages/83/6f/b605fc1d8e06e377ae50870dc44a28cfe6562f0032e36dd53a5ef49472db/policyengine_us-1.691.12-py3-none-any.whl", hash = "sha256:ef43482bd8c6cc16f8f1d4050423f5dc1d045af15931f5d1b089715a31c839d2", size = 10022960, upload-time = "2026-05-16T16:54:35.255Z" }, ] [[package]] From 7aa5443c9dfbb7b9e3736de03e1cb5626a91c13c Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Sun, 17 May 2026 16:31:50 -0400 Subject: [PATCH 2/2] Fix long-run bundle PR checks --- changelog.d/crfb-longrun-bundle.added.md | 1 + src/policyengine/provenance/manifest.py | 7 +------ tests/test_release_manifests.py | 8 ++++++-- 3 files changed, 8 insertions(+), 8 deletions(-) create mode 100644 changelog.d/crfb-longrun-bundle.added.md diff --git a/changelog.d/crfb-longrun-bundle.added.md b/changelog.d/crfb-longrun-bundle.added.md new file mode 100644 index 00000000..f5f8f82b --- /dev/null +++ b/changelog.d/crfb-longrun-bundle.added.md @@ -0,0 +1 @@ +Added the US long-run managed dataset bundle. diff --git a/src/policyengine/provenance/manifest.py b/src/policyengine/provenance/manifest.py index 9332cc92..9de4596e 100644 --- a/src/policyengine/provenance/manifest.py +++ b/src/policyengine/provenance/manifest.py @@ -517,12 +517,7 @@ def resolve_local_managed_dataset_source( ) for candidate_repo_root in explicit_repo_roots: - local_path = ( - candidate_repo_root - / data_package_name - / "storage" - / path_in_repo - ) + local_path = candidate_repo_root / data_package_name / "storage" / path_in_repo if local_path.exists(): return str(local_path) diff --git a/tests/test_release_manifests.py b/tests/test_release_manifests.py index a47e8e93..0924959c 100644 --- a/tests/test_release_manifests.py +++ b/tests/test_release_manifests.py @@ -82,7 +82,9 @@ def test__given_us_manifest__then_has_pinned_model_and_data_packages(self): assert manifest.data_package.version == "1.115.3" assert manifest.data_package.repo_id == "policyengine/policyengine-us-data" assert manifest.data_package.release_manifest_path == US_DATA_RELEASE_PATH - assert manifest.data_package.release_manifest_revision == US_DATA_RELEASE_REVISION + assert ( + manifest.data_package.release_manifest_revision == US_DATA_RELEASE_REVISION + ) assert manifest.certified_data_artifact is not None assert ( manifest.certified_data_artifact.build_id == "policyengine-us-data-1.115.3" @@ -308,7 +310,9 @@ def test__given_release_manifest_artifact_uses_version_tag__then_rewrites_to_com ): manifest = get_data_release_manifest("us") - assert manifest.artifacts["enhanced_cps_2024"].uri == US_ENHANCED_CPS_MANAGED_URI + assert ( + manifest.artifacts["enhanced_cps_2024"].uri == US_ENHANCED_CPS_MANAGED_URI + ) assert ( manifest.source_sha256 == hashlib.sha256(json.dumps(payload).encode("utf-8")).hexdigest()