Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion app/src/data.json

Large diffs are not rendered by default.

75 changes: 39 additions & 36 deletions paper/snapshot/20260501/manifest.json
Original file line number Diff line number Diff line change
Expand Up @@ -15,46 +15,46 @@
"path": "paper/snapshot/20260501/runs/us_full_run_20260513_policyengine_4_4_4_nested_outputs",
"prompt_payload_sha256": "15e3c512a846fe38b396552dacfdab9f7f4af6de0f00dc3f3ff2cff1cec587a0",
"files": {
"analysis/impact_summary_by_model.csv": "fb41e35ec39536755c65b1d7f0fc5292547dbb7f97a0d4a9df74b94f76eb9fb1",
"analysis/metrics.csv": "45772916942e3bc0b67225bae50c6e847de38ca84f47d48e05fe8b9d15679048",
"analysis/report.md": "3dd85897ace3ba30b02d076fb0db352290d5a9e2e504de97cce3249dd91c1ea8",
"analysis/summary_by_model.csv": "6fe934cad610310f4f4fb8f4e1e19800b33c9ed22533a600ac795fb70767cfa6",
"analysis/summary_by_variable.csv": "0d30219cd09e0061fb83d7f47a35c828026a0174568aa067aba8ad3cb5f3fa74",
"analysis/impact_summary_by_model.csv": "4c6e0eea3f2aa50d35e6503aeae28e650cd910cd51e69ddaf9e33162a5f71bdd",
"analysis/metrics.csv": "beb4fb69875f9d845bdee8a290269ac3be899028d8bf507977cd12e8e34dfcf9",
"analysis/report.md": "e7f72cde18b792b16834ca68a2d16a9e6e0b80184f55cf426dfee8d6c65a6c44",
"analysis/summary_by_model.csv": "550e42c7d110f3a8b9d6d534c04b484bb493058a95136b05992ab8c97f47eaa9",
"analysis/summary_by_variable.csv": "e0a6072efdb577b728c7f87d9dab6a93ed20d45cce30ba2be7d11b345c287371",
"analysis/usage_summary.csv": "0c86a1d60f4e9c8228e0a0f36ffedabf7c5d9a52ecb8e6b87466e3799db48039",
"data.json": "1016061c8f4a089d54188c66f041360c62d6087da7e8fdf7c4a84d01d3b06b16",
"data.json": "9de0934382e3f1a22642599b1066547ad3b6eaf931f050ae0d8cb0fac04e7351",
"predictions.csv.gz": "c37d8b0144ebb26b52af8c799c72043461a7b50f36e32c0b7917d84ab1f36102",
"reference_outputs.csv": "04febaadd091dfa7de97fcf71a8a52d3db2296ca98fe1c97f9be92c4bf1383ce",
"reference_outputs.csv.meta.json": "4db925a1841ce8549d99e3e69cd5796dab3ca960da08a6108185f5cbc0190f8c",
"reference_outputs.csv": "2a2e0294d2aa6170822507a7ebcc6ad3f58a5671f548c78349cba5f7918a874f",
"reference_outputs.csv.meta.json": "343d15410e397781c85c18b9fb10bf79f223a73655d740261ff17f345a18ce1b",
"scenarios.csv": "b05091225c066b652c11d16bf1c78778193d4f3ee277ce8aed8f0a7536c0e66c",
"scenarios.csv.meta.json": "ec705caa68554f008ef7b6e76e1e5d9b1f9768588e786e35438562b4dfb00074"
"scenarios.csv.meta.json": "935035bf37612fdc1c35cc7bd1a0335f5a320f3cac4b7d32463108cbc0810e23"
}
},
"uk_full_run_20260513_policyengine_4_4_4_nested_outputs": {
"path": "paper/snapshot/20260501/runs/uk_full_run_20260513_policyengine_4_4_4_nested_outputs",
"prompt_payload_sha256": "2f64b11a99aac4b4e1e150e397481b9139ae38d6f8e13065de2220f4b210dbfe",
"files": {
"analysis/impact_summary_by_model.csv": "4d43581d8175e15cb057394c50b3fad257cd3274095c81e7dbd99134271d096d",
"analysis/metrics.csv": "6fee6d59c507c9bec2ada2dfda5ed499f5771e2e81526110043a7203ff1db86f",
"analysis/report.md": "7afeda8b6bd49b1cfea5d61c620aaa52dd67fb4734ecd614a7106b61ede34ece",
"analysis/summary_by_model.csv": "8261c40f6a5e1ecfcde3acd4ea3be5924cd4619df9679f0ed5fdbbe2d6497f92",
"analysis/summary_by_variable.csv": "4bcfa8a601974949caadacd058a49d0caec09480d53a54bf48292a9f3663d667",
"analysis/impact_summary_by_model.csv": "323bf23f6e63a4a53a690da5f68fa43194f59eee3dc1763c1048a15ea871ffb7",
"analysis/metrics.csv": "3c91d3725e964c574f932d74bbf4abc674df66649745f8b5cffaf7417800eea5",
"analysis/report.md": "dd9d7c49bcbfaf0dd83b20c3f0ff23653d46daeae2ff1385c8e1157126e94568",
"analysis/summary_by_model.csv": "09dada69c49e955ee7c7f2e167c97b9299f4ec000dc628925f305f1cf448781d",
"analysis/summary_by_variable.csv": "058f16789dac25de666b754e565dc73777ba373263491d973e4a8364e6f838f1",
"analysis/usage_summary.csv": "0246ee29ddea4091842ec6f471cae37cb2064159cccb6b9e6a03890ad910bdf8",
"data.json": "9dc661d1ee7e411c7802ffbbee4ae7d3be2cb1e883e272a02cd00fd780a9310c",
"data.json": "4ae6324978b35c39387ec741824feb1f6a5ae784ff685491e635fee621a4ab6b",
"predictions.csv.gz": "808a2583ab145120442cecb91f3ca2cb5d194a8a825d7417a6ffc1362657ed5d",
"reference_outputs.csv": "6286a238e54a0b948385a60a73f36a1aae3cb9bf6612fa28c530d0fd3e024aae",
"reference_outputs.csv.meta.json": "5634984705ff8312936a2b57fdc339efef4747ee270949a2caa23a7d203da9e8",
"reference_outputs.csv": "8c98e2822a785902f9b9671f73dceabcf789af5c67378b3ba6de1d544354d1b1",
"reference_outputs.csv.meta.json": "52e57839228c140638c80362fcede6c960973ec6550153e7c5d5fb371a62f2b4",
"scenarios.csv": "eee79fb1db36851fd3b0d3fd343c1dbe62f86222f45b1fe806a5e0f4e49645d2",
"scenarios.csv.meta.json": "41a7c2cacf06f452164057ec4092c386938f2243a97ba30e0e3f0182f3ad6e34"
"scenarios.csv.meta.json": "56961324be94e856d25e6ebfbeb794f56432f755e32da6429a26f80a13c67a00"
}
}
},
"committed_snapshot_artifacts": {
"us_scenarios.csv": "b05091225c066b652c11d16bf1c78778193d4f3ee277ce8aed8f0a7536c0e66c",
"us_reference_outputs.csv": "04febaadd091dfa7de97fcf71a8a52d3db2296ca98fe1c97f9be92c4bf1383ce",
"us_impact_summary_by_model.csv": "fb41e35ec39536755c65b1d7f0fc5292547dbb7f97a0d4a9df74b94f76eb9fb1",
"us_reference_outputs.csv": "2a2e0294d2aa6170822507a7ebcc6ad3f58a5671f548c78349cba5f7918a874f",
"us_impact_summary_by_model.csv": "4c6e0eea3f2aa50d35e6503aeae28e650cd910cd51e69ddaf9e33162a5f71bdd",
"uk_scenarios.csv": "eee79fb1db36851fd3b0d3fd343c1dbe62f86222f45b1fe806a5e0f4e49645d2",
"uk_reference_outputs.csv": "6286a238e54a0b948385a60a73f36a1aae3cb9bf6612fa28c530d0fd3e024aae",
"uk_impact_summary_by_model.csv": "4d43581d8175e15cb057394c50b3fad257cd3274095c81e7dbd99134271d096d"
"uk_reference_outputs.csv": "8c98e2822a785902f9b9671f73dceabcf789af5c67378b3ba6de1d544354d1b1",
"uk_impact_summary_by_model.csv": "323bf23f6e63a4a53a690da5f68fa43194f59eee3dc1763c1048a15ea871ffb7"
},
"rendered_paper_artifacts": {
"pdf": {
Expand Down Expand Up @@ -86,7 +86,7 @@
},
"reproducibility_notes": [
"The top-level scenario, refreshed reference-output, and impact-summary CSVs are byte-identical to the corresponding compact source-run artifacts copied under paper/snapshot/20260501/runs/.",
"Most model responses are from the May 13, 2026 source run; Gemini 3.5 Flash was added on 2026-05-20 using the same prompts, scenarios, and reference outputs. Reference outputs and derived scores were refreshed on 2026-05-15 with policyengine.py 4.4.4, policyengine-us 1.691.10 from Git SHA 4fd79e6608bc2dac3a7fde0be37191cb4870bd85, and policyengine-uk 2.88.16. The PE-UK stock-capital period-handling update did not change any reference value in the frozen UK sample.",
"Most model responses are from the May 13, 2026 source run; Gemini 3.5 Flash was added on 2026-05-20 using the same prompts and scenarios. Reference outputs and derived scores were refreshed again on 2026-05-23 with policyengine.py 4.10.0, policyengine-us 1.705.1 from Git SHA 7a7791f7a71e53629ff7b682a6960f3ab3a9e594, and policyengine-uk 2.88.22 from Git SHA 7445869cfed59248be53778588856c2d688b34be after PolicyEngine/policyengine-us#8374 and PolicyEngine/policyengine-uk#1691.",
"Canonical prediction files include parser recovery, accepted full-response retries, and final row-level contract repairs. The preserved retry and repair artifacts retain the failed attempts and replaced original rows.",
"Raw provider responses are retained in compressed source-run predictions.csv.gz files. The separate LiteLLM cache remains local-only because it is a generated request cache, not the canonical snapshot artifact.",
"Model APIs and upstream model aliases may change after the recorded 2026-05-13 to 2026-05-20 response window, so exact reruns can diverge even with the committed household inputs, reference outputs, parsed dashboard export, and analysis summaries."
Expand All @@ -113,20 +113,23 @@
},
"model_response_date": "2026-05-13 to 2026-05-20",
"reference_output_refresh": {
"date": "2026-05-15",
"policyengine_version": "4.4.4",
"policyengine_us_version": "1.691.10",
"policyengine_us_git_sha": "4fd79e6608bc2dac3a7fde0be37191cb4870bd85",
"policyengine_uk_version": "2.88.16"
"date": "2026-05-23",
"policyengine_version": "4.10.0",
"policyengine_us_version": "1.705.1",
"policyengine_us_git_sha": "7a7791f7a71e53629ff7b682a6960f3ab3a9e594",
"policyengine_us_note": "Refreshed after PolicyEngine/policyengine-us#8374 payroll-tax 401(k)/FICA fix.",
"policyengine_uk_version": "2.88.22",
"policyengine_uk_git_sha": "7445869cfed59248be53778588856c2d688b34be",
"policyengine_uk_note": "Refreshed after PolicyEngine/policyengine-uk#1691 2026-27 benefit and tax parameter update."
},
"files": [
{
"path": "runs/us_full_run_20260513_policyengine_4_4_4_nested_outputs/data.json",
"sha256": "1016061c8f4a089d54188c66f041360c62d6087da7e8fdf7c4a84d01d3b06b16"
"sha256": "9de0934382e3f1a22642599b1066547ad3b6eaf931f050ae0d8cb0fac04e7351"
},
{
"path": "runs/uk_full_run_20260513_policyengine_4_4_4_nested_outputs/data.json",
"sha256": "9dc661d1ee7e411c7802ffbbee4ae7d3be2cb1e883e272a02cd00fd780a9310c"
"sha256": "4ae6324978b35c39387ec741824feb1f6a5ae784ff685491e635fee621a4ab6b"
}
],
"response_retry_artifacts": {
Expand Down Expand Up @@ -195,14 +198,14 @@
"note": "Developer-led row and case annotations for every wrong prediction row in the repaired frozen snapshot. Final failure_source values are limited to llm_error.",
"files": {
"README.md": "f8f76a07a2214235d51f6567469c51f615be9e9556b7c1c7c433c2a42b0009d7",
"uk_benefit_annotations.csv": "ce1240613d27705cf3edc5fd5263116d9ee5a1444ec6f3c57cf8b747b21d80f4",
"uk_case_notes.csv": "77904fbd7cba33b4f35e8234eb79f4c43c639c78f54cbad3f259d378b438ab04",
"uk_benefit_annotations.csv": "912c2267b6cbe3f103569331dfaa49917a1cce05a44d6f61948df95f656ed96f",
"uk_case_notes.csv": "cdb46ef8726551e506f93c89eed15b8b99404ca46f638ebef1fb423f7c56a840",
"uk_case_reference_explanations.csv": "8cb5c2721e947a7503e98ebb897ffe2f5d0bee6f69b9e6542dc1f66afa165c16",
"uk_tax_annotations.csv": "95aee45ec30391d91e0b0ff7bab0c2bd16e3535f29cc846e456e0da3f885ce05",
"us_benefit_annotations.csv": "2882e221faa00ba2e4f539a8138bf6d3bbfa3e14ffab73501cce99b8185c32cb",
"us_case_notes.csv": "beb0a130ebc6ca5264f181225b6bb56cdb6aa22ddd72841573092e333f72b71b",
"uk_tax_annotations.csv": "a432dc274e0ccf84578eb57fee4a367600d790f76b3d92fc760496e9fdb07e1e",
"us_benefit_annotations.csv": "97ea18c03e04cd0a2554fb5e94bcb88e5dd8a3fedb8743bb39f38cdd322b567a",
"us_case_notes.csv": "11113b0e61230eb16f64670d6e7e18b5673e3f88d0d42fa807966b0ff02a1111",
"us_case_reference_explanations.csv": "74a118e21cca22855f957d1482d1a0d9807fb0da159d00541c2e5ec6b4376466",
"us_tax_annotations.csv": "9f87399422822103fc9487f96be600d06cbd75b08a7a83cb8abb63bda2656c7f"
"us_tax_annotations.csv": "a99153942d4dfffd8469a74d50b1249ccb91735c18355c08899cd818e954a14d"
}
},
"row_repair_artifacts": {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
model,mean_impact_score,mean_household_score,mean_household_coverage,households,total_variables,parsed_variables,floor_share
gpt-5.5,0.8919425055118647,0.954344792415381,1.0,100,700,700,0.3
claude-sonnet-4.6,0.8711053331963318,0.9392503058676616,1.0,100,700,700,0.3
gemini-3.1-pro-preview,0.8642317502764404,0.9447985279307975,1.0,100,700,700,0.3
gemini-3.5-flash,0.8588904415104887,0.9386889516613488,1.0,100,700,700,0.3
grok-4.20,0.8563325063512033,0.9391958032416197,1.0,100,700,700,0.3
claude-opus-4.7,0.8537603246897184,0.9368854149249966,1.0,100,700,700,0.3
gemini-3-flash-preview,0.8185531920307417,0.9268658948980719,1.0,100,700,700,0.3
gemini-3.1-flash-lite-preview,0.7754507823524542,0.896382101962189,1.0,100,700,700,0.3
grok-4.3,0.7331042932126658,0.8875855328533854,1.0,100,700,700,0.3
claude-haiku-4.5,0.6065472179837889,0.84771671666668,1.0,100,700,700,0.3
gpt-5.4-mini,0.5724854864325197,0.8411461177829542,1.0,100,700,700,0.3
grok-4.1-fast,0.5621234757408923,0.8399590878092995,1.0,100,700,700,0.3
gpt-5.4-nano,0.5070068961652234,0.7784150343086544,1.0,100,700,700,0.3
gpt-5.5,0.9108533738200049,0.9635913030672099,1.0,100,700,700,0.3
claude-sonnet-4.6,0.8903358816642252,0.9485374304240298,1.0,100,700,700,0.3
gemini-3.1-pro-preview,0.8739760219510613,0.9521583177567288,1.0,100,700,700,0.3
gemini-3.5-flash,0.8702926360271063,0.9460206234579737,1.0,100,700,700,0.3
grok-4.20,0.8688119040639996,0.9470257251808101,1.0,100,700,700,0.3
claude-opus-4.7,0.8670758689495704,0.9449554279141591,1.0,100,700,700,0.3
gemini-3-flash-preview,0.8204422081665875,0.929493103778789,1.0,100,700,700,0.3
gemini-3.1-flash-lite-preview,0.7773427548196474,0.8984227294016294,1.0,100,700,700,0.3
grok-4.3,0.7435319639378237,0.8946598794252497,1.0,100,700,700,0.3
claude-haiku-4.5,0.6140941751031156,0.8505876507121914,1.0,100,700,700,0.3
gpt-5.4-mini,0.577446782392934,0.8458879528032149,1.0,100,700,700,0.3
grok-4.1-fast,0.5728734508668789,0.8469043286828928,1.0,100,700,700,0.3
gpt-5.4-nano,0.5107235963771908,0.7800365180290781,1.0,100,700,700,0.3
Loading