PolicyEngine · MaxGhenis · May 19, 2026 · May 19, 2026 · May 19, 2026
diff --git a/app/public/paper/policybench.pdf b/app/public/paper/policybench.pdf
diff --git a/app/public/paper/web/index.html b/app/public/paper/web/index.html
diff --git a/app/src/components/Methodology.tsx b/app/src/components/Methodology.tsx
@@ -264,23 +264,29 @@ export default function Methodology({
           facts. It excludes intermediate tax bases, payroll subcomponents, and
           outputs that mainly require unavailable history, restricted local
           market data, or program take-up assignment. WIC is scored as
-          person-level eligibility, not as a dollar amount.
+          person-level eligibility, not as a dollar amount. Local income tax
+          is still requested and displayed, but it currently receives zero
+          default population-impact weight because the full Enhanced CPS source
+          contains no records with positive modeled local income tax.
         </SectionCard>
 
         <SectionCard title="Scoring and weighting">
-          The public leaderboard uses bounded global variable weights. For each
-          household, the row score is{" "}
+          The public leaderboard uses population household-impact weights. For
+          each household, the row score is{" "}
           <code>max(0, 1 − |pred − ref| / |ref|)</code> when the reference is
           nonzero and matches exactly when the reference is zero; the same
           formula handles boolean eligibility flags naturally because
-          ref ∈ {"{0, 1}"} gives a 0/1 score directly. Each household&apos;s
-          per-variable share is{" "}
+          ref ∈ {"{0, 1}"} gives a 0/1 score directly. For each household in
+          the full source population, the per-output share is{" "}
           <code>|ref| / max(|household_net_income|, Σ |ref|)</code>, a value in
           [0, 1] that&apos;s strictly less than one when net income dominates
           the gross tax-benefit flow and equals one only when programs cancel
-          each other out. The mean of those shares across all benchmark
-          households is then renormalized so the global variable weights sum
-          to one, and those weights are applied to score every household.
+          each other out. Those shares are averaged using calibrated household
+          weights in the full source population, then renormalized so the
+          output weights sum to one. US weights use the full Enhanced CPS; UK
+          weights use the full enhanced FRS. The weights are then applied to
+          the fixed benchmark households and renormalized within each household
+          over requested outputs.
           {country === "us"
             ? " Person-level eligibility flags like Medicaid carry weight through PolicyEngine's paired per-capita value (e.g. medicaid_value), so the LLM is graded only on the boolean call itself."
             : " Person-level eligibility flags carry weight through PolicyEngine's paired per-capita value, so the LLM is graded only on the boolean call itself."}

diff --git a/app/src/data.json b/app/src/data.json
diff --git a/app/src/lib/sensitivity.ts b/app/src/lib/sensitivity.ts
@@ -27,13 +27,13 @@ export const SENSITIVITY_VIEWS: SensitivityView[] = [
     id: "household",
     label: "Household",
     description:
-      "Bounded global variable weights — each variable's per-household share is |ref| / max(|household_net_income|, Σ|ref|) (a value in [0, 1] that's < 1 when net income dominates and = 1 only when programs cancel out). Those shares are averaged across households and the resulting global weights are renormalized to sum to one before being applied to score every household. One household = one household, weight reflects each variable's net-income share.",
+      "Population household-impact weights — each output group's share is |ref| / max(|household_net_income|, Σ|ref|) in the full source microsimulation population, averaged with household weights and renormalized before scoring each benchmark household. US weights use the full Enhanced CPS; UK weights use the full enhanced FRS.",
   },
   {
     id: "aggregate",
     label: "Aggregate",
     description:
-      "Budget-weighted — each variable's weight is its share of total absolute reference dollars across the benchmark, renormalized within each household so per-household weights sum to one. One dollar of impact = one dollar.",
+      "Budget-weighted — each output group's weight is its share of total absolute reference dollars in the full source microsimulation population, renormalized within each benchmark household. One dollar of impact = one dollar.",
   },
   {
     id: "equal",

diff --git a/app/tests/programFilters.test.ts b/app/tests/programFilters.test.ts
@@ -9,7 +9,7 @@ import {
 } from "../src/lib/programFilters";
 
 function assertClose(actual: number | undefined, expected: number): void {
-  assert.notEqual(actual, undefined);
+  assert.ok(actual !== undefined);
   assert.ok(Math.abs(actual - expected) < 1e-9);
 }
 

diff --git a/paper/index.qmd b/paper/index.qmd
diff --git a/paper/snapshot/20260501/manifest.json b/paper/snapshot/20260501/manifest.json
@@ -17,11 +17,11 @@
       "files": {
         "analysis/impact_summary_by_model.csv": "280b6e996f9f5c437e9a5b30c1c814d8d139b33c69782a0d9585b71c8dd54865",
         "analysis/metrics.csv": "67ff07b37d7fe2516c1453a1b8cf7ff15a37df219e954a80ac486f94a62d088c",
-        "analysis/report.md": "840031d5936233e9ea02474a1322f480412edb41189db7311ac18bc2f2e2f66d",
-        "analysis/summary_by_model.csv": "3816e8d00aee3d99678e194f71823f6e9658f99dfa9249d5b8fc5131b7c5bb5f",
+        "analysis/report.md": "43930f17d91dce2bde3a4f2119972e077c165e0fc0fe2d3b1976e3bef028a06b",
+        "analysis/summary_by_model.csv": "2836cb83d073abcf8b3108566867c7454f19489793faaf3cc39fe457a6e99115",
         "analysis/summary_by_variable.csv": "a97f73edf7473018ee4481be7e3bc714618e48c846d9427be66f4faba3ce394c",
         "analysis/usage_summary.csv": "9124504feb192a1fe635022f9acecd7599996e61ffad6036fa16a6bad4335040",
-        "data.json": "efcbce10da14ffa2615519b52e8f5b156a40279ac931fd628031bf6e009a9d7f",
+        "data.json": "1047cd8b676dd79b5fde3a15ae74967615fdedb352f5f3a4812fdaa85f58fca4",
         "predictions.csv.gz": "46c030a3d9cb3a2e90652f99bc741e32db7c4f2499018417e75497ecbada74f3",
         "reference_outputs.csv": "04febaadd091dfa7de97fcf71a8a52d3db2296ca98fe1c97f9be92c4bf1383ce",
         "reference_outputs.csv.meta.json": "4db925a1841ce8549d99e3e69cd5796dab3ca960da08a6108185f5cbc0190f8c",
@@ -35,11 +35,11 @@
       "files": {
         "analysis/impact_summary_by_model.csv": "37e1fba0d40f9edbc1493581ce0daf5cd413c0bc39b0d6a57f99c3f248a544b2",
         "analysis/metrics.csv": "7645e9ed00018a3e68c4ab935ccb42e5c0ce49539eee92c9c1d5e5bad19d4315",
-        "analysis/report.md": "74ae0cd6bbef6c8257f8ee4c38718c0f4a4941a3db3eeffb13b71f6cdaf9e83f",
-        "analysis/summary_by_model.csv": "f9e3c7539b8037819e6deb57a8dd503b8f4d9bcce750560da54927a6d32684ce",
+        "analysis/report.md": "f6a7cb48af2778e53c06a9fcadb40dd84d0a7b95b586d214e0279b16b4c233a1",
+        "analysis/summary_by_model.csv": "1c9cb7d669f4ae5a47afb08a3d31dced85b7de5b17865f110aedb099809e15b2",
         "analysis/summary_by_variable.csv": "bc1117874633598cda8ac2c608d99b6bf5e34e8e8907256e8570702e24bc6e18",
         "analysis/usage_summary.csv": "a8f7f43ed996daf28ccbd3d8d514b86c90fca11146e66eb1fc86b9c08ff18b66",
-        "data.json": "9c1c2a47021432c571d19ac2ebcfedd1c52e530aa1fade7d2269528cd0229aa7",
+        "data.json": "2dd2327856b830fdc5043bf45e8a0e1357a95020cc10286da8aae1be8fd3d6e3",
         "predictions.csv.gz": "34502eb863e0e67e149b852c9a9676032a105044ced5c97898d1d1f59b08013c",
         "reference_outputs.csv": "6286a238e54a0b948385a60a73f36a1aae3cb9bf6612fa28c530d0fd3e024aae",
         "reference_outputs.csv.meta.json": "5634984705ff8312936a2b57fdc339efef4747ee270949a2caa23a7d203da9e8",
@@ -59,14 +59,14 @@
   "rendered_paper_artifacts": {
     "pdf": {
       "path": "app/public/paper/policybench.pdf",
-      "sha256": "6e334af2088be3258556c45f2865153865a5ce5368277a6924b4ce19bf6acc03"
+      "sha256": "660c2ecd7117dffbca7175f0f91f29fe05a4c3a1851184f441ab728259595b1d"
     },
     "web": {
       "path": "app/public/paper/web",
       "files": {
         "figures/global_leaderboard.png": "526511477880573c4e7cae98e3d966eb400eecf7e44c807ddcd4a5d377ebcd6f",
         "figures/positive_zero_scatter.png": "b15332fdda92c8f23269937c90968fb50327186fb154bc29729264586dc463d5",
-        "index.html": "1a45bc2d3d15c3592106fbaa2b5eeea4c2e23ceacf348c464e49387f8bb2b8b9",
+        "index.html": "a351f8d7c3e6178bd2388d1848bb28cb8819579d3d2ed591869c8bf0a7ec81da",
         "pe-tokens.css": "8f24d8da26f583c8ffddffcdcd172b6d52cbecfec20eda55bd39d7aa829f41d8",
         "policybench-theme.css": "0e12c5fd615558259e5bce0167a38424e54f9ceb280666c4afd660d759cd1cb9",
         "site_libs/clipboard/clipboard.min.js": "e17a1d816e13c0826e0ed7febfabc3277f45571234bde0bf9120829a7169edc9",
@@ -230,5 +230,10 @@
       "round_1/us/source_predictions.csv.gz": "1ebf2389786a79acc5fc9af3c6f900272d2f7e4ef8f87bbdf155995711b2f465",
       "round_1/us/target_rows.csv": "65804743a0d579f099826046e72a52785f122dba6d860552c27d6ea83b85dc8b"
     }
+  },
+  "population_weight_artifact": {
+    "path": "policybench/population_weights.json",
+    "sha256": "2a8d108946fd42e6c1a9d163f5e90ed342b8750c06217fc80411eb8eb10de5a6",
+    "note": "Output weights for the household and aggregate scoring views. US weights use the full Enhanced CPS; UK weights use the full enhanced FRS. These weights are fixed for scoring the 100-household snapshot."
   }
 }
diff --git a/.../runs/uk_full_run_20260513_policyengine_4_4_4_nested_outputs/analysis/report.md b/.../runs/uk_full_run_20260513_policyengine_4_4_4_nested_outputs/analysis/report.md
@@ -29,10 +29,10 @@ Total cost: `33.913` USD. Estimated total runtime: `473.1 min`.
 
 | model | mean_score | mean_exact | mean_within_1pct | mean_within_5pct | mean_within_10pct | mean_binary_accuracy | mean_mae | total_n |
 | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: |
-| gpt-5.5 | 0.806 | 0.707 | 0.757 | 0.847 | 0.914 | n/a | 245.595 | 700 |
 | gemini-3.1-pro-preview | 0.790 | 0.704 | 0.719 | 0.839 | 0.900 | n/a | 342.190 | 700 |
-| claude-sonnet-4.6 | 0.781 | 0.697 | 0.713 | 0.829 | 0.887 | n/a | 340.302 | 700 |
+| gpt-5.5 | 0.806 | 0.707 | 0.757 | 0.847 | 0.914 | n/a | 245.595 | 700 |
 | grok-4.20 | 0.781 | 0.703 | 0.713 | 0.820 | 0.890 | n/a | 355.731 | 700 |
+| claude-sonnet-4.6 | 0.781 | 0.697 | 0.713 | 0.829 | 0.887 | n/a | 340.302 | 700 |
 | claude-opus-4.7 | 0.781 | 0.699 | 0.711 | 0.826 | 0.887 | n/a | 419.425 | 700 |
 | gemini-3-flash-preview | 0.768 | 0.704 | 0.714 | 0.799 | 0.853 | n/a | 460.365 | 700 |
 | gemini-3.1-flash-lite-preview | 0.739 | 0.694 | 0.700 | 0.757 | 0.806 | n/a | 570.365 | 700 |
@@ -42,24 +42,24 @@ Total cost: `33.913` USD. Estimated total runtime: `473.1 min`.
 | grok-4.1-fast | 0.736 | 0.707 | 0.714 | 0.753 | 0.771 | n/a | 1094.751 | 700 |
 | gpt-5.4-nano | 0.692 | 0.686 | 0.686 | 0.696 | 0.701 | n/a | 1448.748 | 700 |
 
-## Bounded global variable weights (headline)
+## Population household-impact weights (headline)
 
-Households receive equal weight. The score is a weighted average of continuous row scores; each variable's weight is the mean across households of `|ref_ij| / max(|household_net_income_i|, sum_k |ref_ik|)`, renormalized so weights sum to 1.
+Households receive equal weight. The score is a weighted average of continuous row scores. Canonical country reports use full-population output-group weights: US weights come from the full Enhanced CPS, UK weights from the full enhanced FRS. Each output group's weight is the household-weighted mean of `|ref_ij| / max(|household_net_income_i|, sum_k |ref_ik|)` in the source population, renormalized so weights sum to 1.
 
 | model | bounded_score | amount_accuracy | participation_accuracy |
 | --- | ---: | ---: | ---: |
-| claude-haiku-4.5 | 0.733 | 0.454 | 0.929 |
-| claude-opus-4.7 | 0.886 | 0.788 | 0.969 |
-| claude-sonnet-4.6 | 0.896 | 0.800 | 0.973 |
-| gemini-3-flash-preview | 0.867 | 0.721 | 0.977 |
-| gemini-3.1-flash-lite-preview | 0.837 | 0.680 | 0.949 |
-| gemini-3.1-pro-preview | 0.903 | 0.802 | 0.977 |
-| gpt-5.4-mini | 0.707 | 0.370 | 0.901 |
-| gpt-5.4-nano | 0.660 | 0.346 | 0.900 |
-| gpt-5.5 | 0.903 | 0.817 | 0.987 |
-| grok-4.1-fast | 0.687 | 0.349 | 0.900 |
-| grok-4.20 | 0.893 | 0.787 | 0.977 |
-| grok-4.3 | 0.820 | 0.632 | 0.930 |
+| claude-haiku-4.5 | 0.737 | 0.446 | 0.929 |
+| claude-opus-4.7 | 0.889 | 0.782 | 0.969 |
+| claude-sonnet-4.6 | 0.896 | 0.795 | 0.973 |
+| gemini-3-flash-preview | 0.873 | 0.713 | 0.977 |
+| gemini-3.1-flash-lite-preview | 0.837 | 0.672 | 0.949 |
+| gemini-3.1-pro-preview | 0.911 | 0.801 | 0.977 |
+| gpt-5.4-mini | 0.711 | 0.363 | 0.901 |
+| gpt-5.4-nano | 0.664 | 0.338 | 0.900 |
+| gpt-5.5 | 0.909 | 0.812 | 0.987 |
+| grok-4.1-fast | 0.697 | 0.343 | 0.900 |
+| grok-4.20 | 0.900 | 0.785 | 0.977 |
+| grok-4.3 | 0.816 | 0.622 | 0.930 |
 
 
 ## Household-equal impact score (30% floor — legacy)

diff --git a/...runs/uk_full_run_20260513_policyengine_4_4_4_nested_outputs/analysis/summary_by_model.csv b/...runs/uk_full_run_20260513_policyengine_4_4_4_nested_outputs/analysis/summary_by_model.csv
@@ -1,13 +1,13 @@
 model,mean_score,mean_exact,mean_within_1pct,mean_within_5pct,mean_mae,mean_mape,mean_within_10pct,mean_accuracy,mean_coverage,total_n,parsed_n,bounded_score,amount_accuracy,participation_accuracy,equal_score,aggregate_score
-gpt-5.5,0.8064285714285714,0.7071428571428572,0.7571428571428571,0.8471428571428571,245.5950380074714,1.5414292346573133,0.9142857142857144,,1.0,700,700,0.903367930484206,0.8170099719066646,0.9871428571428571,0.954344792415381,0.9003379609121283
-gemini-3.1-pro-preview,0.7903571428571429,0.7042857142857143,0.7185714285714286,0.8385714285714286,342.1902230660652,1.6445076728317636,0.9,,1.0,700,700,0.9030466292474396,0.8021901966385488,0.9771428571428571,0.9447985279307974,0.9017934170481368
-claude-sonnet-4.6,0.7814285714285714,0.6971428571428572,0.7128571428571429,0.8285714285714285,340.30219302281733,1.5760235112441965,0.8871428571428571,,1.0,700,700,0.8964484983721506,0.7995679531351527,0.9728571428571429,0.9392503058676615,0.8959571381915014
-grok-4.20,0.7814285714285714,0.7028571428571428,0.7128571428571429,0.8200000000000001,355.73093788191335,1.654425144552594,0.89,,1.0,700,700,0.8934697406535484,0.7873825417645766,0.9771428571428571,0.9391958032416197,0.8897113670183131
-claude-opus-4.7,0.7807142857142857,0.6985714285714286,0.7114285714285715,0.8257142857142857,419.4251000636586,1.6175101777067462,0.8871428571428571,,1.0,700,700,0.8857859227666285,0.787528182105815,0.9685714285714285,0.9368854149249966,0.89178924178507
-gemini-3-flash-preview,0.7675000000000001,0.7042857142857143,0.7142857142857143,0.7985714285714286,460.36469078893066,14.15793020330454,0.8528571428571429,,1.0,700,700,0.8672262631724518,0.7214544158266778,0.9771428571428571,0.9268658948980719,0.8652141037123774
-gemini-3.1-flash-lite-preview,0.7392857142857144,0.6942857142857143,0.7000000000000001,0.7571428571428571,570.36474322509,0.4288882820234676,0.8057142857142857,,1.0,700,700,0.8365844881458274,0.6795921631551974,0.9485714285714286,0.896382101962189,0.8330152755003196
-grok-4.3,0.7421428571428572,0.6928571428571428,0.7014285714285714,0.7485714285714286,917.0486598859516,1.390015663014456,0.8257142857142857,,1.0,700,700,0.8196624837557729,0.6320521368671593,0.93,0.8875855328533853,0.8282265887480711
-claude-haiku-4.5,0.722857142857143,0.7014285714285714,0.7014285714285714,0.7342857142857143,1075.7582983389796,55.305595814215714,0.7542857142857143,,1.0,700,700,0.7325832852331737,0.4535978733326653,0.9285714285714286,0.84771671666668,0.7258787668181789
-gpt-5.4-mini,0.7278571428571429,0.7071428571428572,0.7071428571428572,0.7300000000000001,1001.7978197051054,0.6273665695018973,0.7671428571428571,,1.0,700,700,0.7065888130762589,0.3704812593043977,0.9014285714285715,0.8411461177829542,0.693424814057304
-grok-4.1-fast,0.7364285714285714,0.7071428571428572,0.7142857142857143,0.7528571428571428,1094.7510867597384,61.05852970735781,0.7714285714285715,,1.0,700,700,0.6872079068977911,0.34912417975348237,0.9,0.8399590878092994,0.6714187528504807
-gpt-5.4-nano,0.6921428571428571,0.6857142857142857,0.6857142857142857,0.6957142857142857,1448.7483450823104,37.29732095822304,0.7014285714285714,,1.0,700,700,0.6597045459086216,0.345723174953565,0.9,0.7784150343086543,0.6353918202596233
+gemini-3.1-pro-preview,0.7903571428571429,0.7042857142857143,0.7185714285714286,0.8385714285714286,342.1902230660652,1.6445076728317636,0.9,,1.0,700,700,0.9111517774705656,0.8013523761395511,0.9771428571428571,0.9447985279307974,0.8940839389579354
+gpt-5.5,0.8064285714285714,0.7071428571428572,0.7571428571428571,0.8471428571428571,245.5950380074714,1.5414292346573133,0.9142857142857144,,1.0,700,700,0.9093743375954415,0.8123781934228569,0.9871428571428571,0.954344792415381,0.8896668687255104
+grok-4.20,0.7814285714285714,0.7028571428571428,0.7128571428571429,0.8200000000000001,355.73093788191335,1.654425144552594,0.89,,1.0,700,700,0.8998278275696227,0.7847639647503688,0.9771428571428571,0.9391958032416197,0.8808862533184595
+claude-sonnet-4.6,0.7814285714285714,0.6971428571428572,0.7128571428571429,0.8285714285714285,340.30219302281733,1.5760235112441965,0.8871428571428571,,1.0,700,700,0.8960040458248818,0.795188820935437,0.9728571428571429,0.9392503058676615,0.8825301666744982
+claude-opus-4.7,0.7807142857142857,0.6985714285714286,0.7114285714285715,0.8257142857142857,419.4251000636586,1.6175101777067462,0.8871428571428571,,1.0,700,700,0.8889683344552282,0.7824543613753719,0.9685714285714285,0.9368854149249966,0.8833840171817674
+gemini-3-flash-preview,0.7675000000000001,0.7042857142857143,0.7142857142857143,0.7985714285714286,460.36469078893066,14.15793020330454,0.8528571428571429,,1.0,700,700,0.8727060087182099,0.7133828171805401,0.9771428571428571,0.9268658948980719,0.8530325349230929
+gemini-3.1-flash-lite-preview,0.7392857142857144,0.6942857142857143,0.7000000000000001,0.7571428571428571,570.36474322509,0.4288882820234676,0.8057142857142857,,1.0,700,700,0.8369532317005409,0.6717560986618268,0.9485714285714286,0.896382101962189,0.8193364174998848
+grok-4.3,0.7421428571428572,0.6928571428571428,0.7014285714285714,0.7485714285714286,917.0486598859516,1.390015663014456,0.8257142857142857,,1.0,700,700,0.8155889848213503,0.6219832713948583,0.93,0.8875855328533853,0.8172201569949634
+claude-haiku-4.5,0.722857142857143,0.7014285714285714,0.7014285714285714,0.7342857142857143,1075.7582983389796,55.305595814215714,0.7542857142857143,,1.0,700,700,0.7374240447218764,0.44550015751012395,0.9285714285714286,0.84771671666668,0.7008386952499461
+gpt-5.4-mini,0.7278571428571429,0.7071428571428572,0.7071428571428572,0.7300000000000001,1001.7978197051054,0.6273665695018973,0.7671428571428571,,1.0,700,700,0.711249890974841,0.3625943841763507,0.9014285714285715,0.8411461177829542,0.6631249243497433
+grok-4.1-fast,0.7364285714285714,0.7071428571428572,0.7142857142857143,0.7528571428571428,1094.7510867597384,61.05852970735781,0.7714285714285715,,1.0,700,700,0.6965521537810396,0.34254537308012356,0.9,0.8399590878092994,0.6386818367308976
+gpt-5.4-nano,0.6921428571428571,0.6857142857142857,0.6857142857142857,0.6957142857142857,1448.7483450823104,37.29732095822304,0.7014285714285714,,1.0,700,700,0.6640855643583242,0.3379562775663838,0.9,0.7784150343086543,0.6074794546270517
diff --git a/...r/snapshot/20260501/runs/uk_full_run_20260513_policyengine_4_4_4_nested_outputs/data.json b/...r/snapshot/20260501/runs/uk_full_run_20260513_policyengine_4_4_4_nested_outputs/data.json