Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file modified app/public/paper/policybench.pdf
Binary file not shown.
343 changes: 189 additions & 154 deletions app/public/paper/web/index.html

Large diffs are not rendered by default.

22 changes: 14 additions & 8 deletions app/src/components/Methodology.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -264,23 +264,29 @@ export default function Methodology({
facts. It excludes intermediate tax bases, payroll subcomponents, and
outputs that mainly require unavailable history, restricted local
market data, or program take-up assignment. WIC is scored as
person-level eligibility, not as a dollar amount.
person-level eligibility, not as a dollar amount. Local income tax
is retained as a displayed requested output, but currently receives
zero default population-impact weight because the full Enhanced CPS
source has no positive modeled local-income-tax records.
</SectionCard>

<SectionCard title="Scoring and weighting">
The public leaderboard uses bounded global variable weights. For each
household, the row score is{" "}
The public leaderboard uses population household-impact weights. For
each household, the row score is{" "}
<code>max(0, 1 − |pred − ref| / |ref|)</code> when the reference is
nonzero and matches exactly when the reference is zero; the same
formula handles boolean eligibility flags naturally because
ref ∈ {"{0, 1}"} gives a 0/1 score directly. Each household&apos;s
per-variable share is{" "}
ref ∈ {"{0, 1}"} gives a 0/1 score directly. Each full source
household&apos;s per-output share is{" "}
<code>|ref| / max(|household_net_income|, Σ |ref|)</code>, a value in
[0, 1] that&apos;s strictly less than one when net income dominates
the gross tax-benefit flow and equals one only when programs cancel
each other out. The mean of those shares across all benchmark
households is then renormalized so the global variable weights sum
to one, and those weights are applied to score every household.
each other out. Those shares are averaged using calibrated household
weights in the full source population, then renormalized so the
output weights sum to one. US weights use the full Enhanced CPS; UK
weights use the full enhanced FRS. The weights are then applied to
the fixed benchmark households and renormalized within each household
over requested outputs.
{country === "us"
? " Person-level eligibility flags like Medicaid carry weight through PolicyEngine's paired per-capita value (e.g. medicaid_value), so the LLM is graded only on the boolean call itself."
: " Person-level eligibility flags carry weight through PolicyEngine's paired per-capita value, so the LLM is graded only on the boolean call itself."}
Expand Down
2 changes: 1 addition & 1 deletion app/src/data.json

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions app/src/lib/sensitivity.ts
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,13 @@ export const SENSITIVITY_VIEWS: SensitivityView[] = [
id: "household",
label: "Household",
description:
"Bounded global variable weights — each variable's per-household share is |ref| / max(|household_net_income|, Σ|ref|) (a value in [0, 1] that's < 1 when net income dominates and = 1 only when programs cancel out). Those shares are averaged across households and the resulting global weights are renormalized to sum to one before being applied to score every household. One household = one household, weight reflects each variable's net-income share.",
"Population household-impact weights — each output group's share is |ref| / max(|household_net_income|, Σ|ref|) in the full source microsimulation population, averaged with household weights and renormalized before scoring each benchmark household. US weights use the full Enhanced CPS; UK weights use the full enhanced FRS.",
},
{
id: "aggregate",
label: "Aggregate",
description:
"Budget-weighted — each variable's weight is its share of total absolute reference dollars across the benchmark, renormalized within each household so per-household weights sum to one. One dollar of impact = one dollar.",
"Budget-weighted — each output group's weight is its share of total absolute reference dollars in the full source microsimulation population, renormalized within each benchmark household. One dollar of impact = one dollar.",
},
{
id: "equal",
Expand Down
2 changes: 1 addition & 1 deletion app/tests/programFilters.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ import {
} from "../src/lib/programFilters";

/**
 * Asserts that `actual` is defined and numerically close to `expected`.
 *
 * The comparison is absolute: the check passes only when
 * `|actual - expected| < 1e-9`.
 *
 * @param actual - Value under test; `undefined` fails the assertion.
 * @param expected - Reference value to compare against.
 * @throws Whatever `assert.ok` throws when `actual` is `undefined` or the
 *   absolute difference is not below `1e-9`.
 */
function assertClose(actual: number | undefined, expected: number): void {
  // One strict definedness check is enough; the previous pair of
  // `notEqual` + `ok` assertions tested the same condition twice.
  assert.ok(
    actual !== undefined,
    `expected a value close to ${expected}, got undefined`,
  );
  const difference = Math.abs(actual - expected);
  // A NaN difference compares false with `<`, so NaN inputs fail here as well.
  assert.ok(
    difference < 1e-9,
    `expected ${actual} to be within 1e-9 of ${expected} (difference ${difference})`,
  );
}

Expand Down
76 changes: 29 additions & 47 deletions paper/index.qmd

Large diffs are not rendered by default.

21 changes: 13 additions & 8 deletions paper/snapshot/20260501/manifest.json
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,11 @@
"files": {
"analysis/impact_summary_by_model.csv": "280b6e996f9f5c437e9a5b30c1c814d8d139b33c69782a0d9585b71c8dd54865",
"analysis/metrics.csv": "67ff07b37d7fe2516c1453a1b8cf7ff15a37df219e954a80ac486f94a62d088c",
"analysis/report.md": "840031d5936233e9ea02474a1322f480412edb41189db7311ac18bc2f2e2f66d",
"analysis/summary_by_model.csv": "3816e8d00aee3d99678e194f71823f6e9658f99dfa9249d5b8fc5131b7c5bb5f",
"analysis/report.md": "43930f17d91dce2bde3a4f2119972e077c165e0fc0fe2d3b1976e3bef028a06b",
"analysis/summary_by_model.csv": "2836cb83d073abcf8b3108566867c7454f19489793faaf3cc39fe457a6e99115",
"analysis/summary_by_variable.csv": "a97f73edf7473018ee4481be7e3bc714618e48c846d9427be66f4faba3ce394c",
"analysis/usage_summary.csv": "9124504feb192a1fe635022f9acecd7599996e61ffad6036fa16a6bad4335040",
"data.json": "efcbce10da14ffa2615519b52e8f5b156a40279ac931fd628031bf6e009a9d7f",
"data.json": "1047cd8b676dd79b5fde3a15ae74967615fdedb352f5f3a4812fdaa85f58fca4",
"predictions.csv.gz": "46c030a3d9cb3a2e90652f99bc741e32db7c4f2499018417e75497ecbada74f3",
"reference_outputs.csv": "04febaadd091dfa7de97fcf71a8a52d3db2296ca98fe1c97f9be92c4bf1383ce",
"reference_outputs.csv.meta.json": "4db925a1841ce8549d99e3e69cd5796dab3ca960da08a6108185f5cbc0190f8c",
Expand All @@ -35,11 +35,11 @@
"files": {
"analysis/impact_summary_by_model.csv": "37e1fba0d40f9edbc1493581ce0daf5cd413c0bc39b0d6a57f99c3f248a544b2",
"analysis/metrics.csv": "7645e9ed00018a3e68c4ab935ccb42e5c0ce49539eee92c9c1d5e5bad19d4315",
"analysis/report.md": "74ae0cd6bbef6c8257f8ee4c38718c0f4a4941a3db3eeffb13b71f6cdaf9e83f",
"analysis/summary_by_model.csv": "f9e3c7539b8037819e6deb57a8dd503b8f4d9bcce750560da54927a6d32684ce",
"analysis/report.md": "f6a7cb48af2778e53c06a9fcadb40dd84d0a7b95b586d214e0279b16b4c233a1",
"analysis/summary_by_model.csv": "1c9cb7d669f4ae5a47afb08a3d31dced85b7de5b17865f110aedb099809e15b2",
"analysis/summary_by_variable.csv": "bc1117874633598cda8ac2c608d99b6bf5e34e8e8907256e8570702e24bc6e18",
"analysis/usage_summary.csv": "a8f7f43ed996daf28ccbd3d8d514b86c90fca11146e66eb1fc86b9c08ff18b66",
"data.json": "9c1c2a47021432c571d19ac2ebcfedd1c52e530aa1fade7d2269528cd0229aa7",
"data.json": "2dd2327856b830fdc5043bf45e8a0e1357a95020cc10286da8aae1be8fd3d6e3",
"predictions.csv.gz": "34502eb863e0e67e149b852c9a9676032a105044ced5c97898d1d1f59b08013c",
"reference_outputs.csv": "6286a238e54a0b948385a60a73f36a1aae3cb9bf6612fa28c530d0fd3e024aae",
"reference_outputs.csv.meta.json": "5634984705ff8312936a2b57fdc339efef4747ee270949a2caa23a7d203da9e8",
Expand All @@ -59,14 +59,14 @@
"rendered_paper_artifacts": {
"pdf": {
"path": "app/public/paper/policybench.pdf",
"sha256": "6e334af2088be3258556c45f2865153865a5ce5368277a6924b4ce19bf6acc03"
"sha256": "660c2ecd7117dffbca7175f0f91f29fe05a4c3a1851184f441ab728259595b1d"
},
"web": {
"path": "app/public/paper/web",
"files": {
"figures/global_leaderboard.png": "526511477880573c4e7cae98e3d966eb400eecf7e44c807ddcd4a5d377ebcd6f",
"figures/positive_zero_scatter.png": "b15332fdda92c8f23269937c90968fb50327186fb154bc29729264586dc463d5",
"index.html": "1a45bc2d3d15c3592106fbaa2b5eeea4c2e23ceacf348c464e49387f8bb2b8b9",
"index.html": "a351f8d7c3e6178bd2388d1848bb28cb8819579d3d2ed591869c8bf0a7ec81da",
"pe-tokens.css": "8f24d8da26f583c8ffddffcdcd172b6d52cbecfec20eda55bd39d7aa829f41d8",
"policybench-theme.css": "0e12c5fd615558259e5bce0167a38424e54f9ceb280666c4afd660d759cd1cb9",
"site_libs/clipboard/clipboard.min.js": "e17a1d816e13c0826e0ed7febfabc3277f45571234bde0bf9120829a7169edc9",
Expand Down Expand Up @@ -230,5 +230,10 @@
"round_1/us/source_predictions.csv.gz": "1ebf2389786a79acc5fc9af3c6f900272d2f7e4ef8f87bbdf155995711b2f465",
"round_1/us/target_rows.csv": "65804743a0d579f099826046e72a52785f122dba6d860552c27d6ea83b85dc8b"
}
},
"population_weight_artifact": {
"path": "policybench/population_weights.json",
"sha256": "2a8d108946fd42e6c1a9d163f5e90ed342b8750c06217fc80411eb8eb10de5a6",
"note": "Output weights for the household and aggregate scoring views. US weights use the full Enhanced CPS; UK weights use the full enhanced FRS. These weights are fixed for scoring the 100-household snapshot."
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,10 @@ Total cost: `33.913` USD. Estimated total runtime: `473.1 min`.

| model | mean_score | mean_exact | mean_within_1pct | mean_within_5pct | mean_within_10pct | mean_binary_accuracy | mean_mae | total_n |
| --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: |
| gpt-5.5 | 0.806 | 0.707 | 0.757 | 0.847 | 0.914 | n/a | 245.595 | 700 |
| gemini-3.1-pro-preview | 0.790 | 0.704 | 0.719 | 0.839 | 0.900 | n/a | 342.190 | 700 |
| claude-sonnet-4.6 | 0.781 | 0.697 | 0.713 | 0.829 | 0.887 | n/a | 340.302 | 700 |
| gpt-5.5 | 0.806 | 0.707 | 0.757 | 0.847 | 0.914 | n/a | 245.595 | 700 |
| grok-4.20 | 0.781 | 0.703 | 0.713 | 0.820 | 0.890 | n/a | 355.731 | 700 |
| claude-sonnet-4.6 | 0.781 | 0.697 | 0.713 | 0.829 | 0.887 | n/a | 340.302 | 700 |
| claude-opus-4.7 | 0.781 | 0.699 | 0.711 | 0.826 | 0.887 | n/a | 419.425 | 700 |
| gemini-3-flash-preview | 0.768 | 0.704 | 0.714 | 0.799 | 0.853 | n/a | 460.365 | 700 |
| gemini-3.1-flash-lite-preview | 0.739 | 0.694 | 0.700 | 0.757 | 0.806 | n/a | 570.365 | 700 |
Expand All @@ -42,24 +42,24 @@ Total cost: `33.913` USD. Estimated total runtime: `473.1 min`.
| grok-4.1-fast | 0.736 | 0.707 | 0.714 | 0.753 | 0.771 | n/a | 1094.751 | 700 |
| gpt-5.4-nano | 0.692 | 0.686 | 0.686 | 0.696 | 0.701 | n/a | 1448.748 | 700 |

## Bounded global variable weights (headline)
## Population household-impact weights (headline)

Households receive equal weight. The score is a weighted average of continuous row scores; each variable's weight is the mean across households of `|ref_ij| / max(|household_net_income_i|, sum_k |ref_ik|)`, renormalized so weights sum to 1.
Households receive equal weight. The score is a weighted average of continuous row scores. Canonical country reports use full-population output-group weights: US weights come from the full Enhanced CPS, UK weights from the full enhanced FRS. Each output group's weight is the household-weighted mean of `|ref_ij| / max(|household_net_income_i|, sum_k |ref_ik|)` in the source population, renormalized so weights sum to 1.

| model | bounded_score | amount_accuracy | participation_accuracy |
| --- | ---: | ---: | ---: |
| claude-haiku-4.5 | 0.733 | 0.454 | 0.929 |
| claude-opus-4.7 | 0.886 | 0.788 | 0.969 |
| claude-sonnet-4.6 | 0.896 | 0.800 | 0.973 |
| gemini-3-flash-preview | 0.867 | 0.721 | 0.977 |
| gemini-3.1-flash-lite-preview | 0.837 | 0.680 | 0.949 |
| gemini-3.1-pro-preview | 0.903 | 0.802 | 0.977 |
| gpt-5.4-mini | 0.707 | 0.370 | 0.901 |
| gpt-5.4-nano | 0.660 | 0.346 | 0.900 |
| gpt-5.5 | 0.903 | 0.817 | 0.987 |
| grok-4.1-fast | 0.687 | 0.349 | 0.900 |
| grok-4.20 | 0.893 | 0.787 | 0.977 |
| grok-4.3 | 0.820 | 0.632 | 0.930 |
| claude-haiku-4.5 | 0.737 | 0.446 | 0.929 |
| claude-opus-4.7 | 0.889 | 0.782 | 0.969 |
| claude-sonnet-4.6 | 0.896 | 0.795 | 0.973 |
| gemini-3-flash-preview | 0.873 | 0.713 | 0.977 |
| gemini-3.1-flash-lite-preview | 0.837 | 0.672 | 0.949 |
| gemini-3.1-pro-preview | 0.911 | 0.801 | 0.977 |
| gpt-5.4-mini | 0.711 | 0.363 | 0.901 |
| gpt-5.4-nano | 0.664 | 0.338 | 0.900 |
| gpt-5.5 | 0.909 | 0.812 | 0.987 |
| grok-4.1-fast | 0.697 | 0.343 | 0.900 |
| grok-4.20 | 0.900 | 0.785 | 0.977 |
| grok-4.3 | 0.816 | 0.622 | 0.930 |


## Household-equal impact score (30% floor — legacy)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
model,mean_score,mean_exact,mean_within_1pct,mean_within_5pct,mean_mae,mean_mape,mean_within_10pct,mean_accuracy,mean_coverage,total_n,parsed_n,bounded_score,amount_accuracy,participation_accuracy,equal_score,aggregate_score
gpt-5.5,0.8064285714285714,0.7071428571428572,0.7571428571428571,0.8471428571428571,245.5950380074714,1.5414292346573133,0.9142857142857144,,1.0,700,700,0.903367930484206,0.8170099719066646,0.9871428571428571,0.954344792415381,0.9003379609121283
gemini-3.1-pro-preview,0.7903571428571429,0.7042857142857143,0.7185714285714286,0.8385714285714286,342.1902230660652,1.6445076728317636,0.9,,1.0,700,700,0.9030466292474396,0.8021901966385488,0.9771428571428571,0.9447985279307974,0.9017934170481368
claude-sonnet-4.6,0.7814285714285714,0.6971428571428572,0.7128571428571429,0.8285714285714285,340.30219302281733,1.5760235112441965,0.8871428571428571,,1.0,700,700,0.8964484983721506,0.7995679531351527,0.9728571428571429,0.9392503058676615,0.8959571381915014
grok-4.20,0.7814285714285714,0.7028571428571428,0.7128571428571429,0.8200000000000001,355.73093788191335,1.654425144552594,0.89,,1.0,700,700,0.8934697406535484,0.7873825417645766,0.9771428571428571,0.9391958032416197,0.8897113670183131
claude-opus-4.7,0.7807142857142857,0.6985714285714286,0.7114285714285715,0.8257142857142857,419.4251000636586,1.6175101777067462,0.8871428571428571,,1.0,700,700,0.8857859227666285,0.787528182105815,0.9685714285714285,0.9368854149249966,0.89178924178507
gemini-3-flash-preview,0.7675000000000001,0.7042857142857143,0.7142857142857143,0.7985714285714286,460.36469078893066,14.15793020330454,0.8528571428571429,,1.0,700,700,0.8672262631724518,0.7214544158266778,0.9771428571428571,0.9268658948980719,0.8652141037123774
gemini-3.1-flash-lite-preview,0.7392857142857144,0.6942857142857143,0.7000000000000001,0.7571428571428571,570.36474322509,0.4288882820234676,0.8057142857142857,,1.0,700,700,0.8365844881458274,0.6795921631551974,0.9485714285714286,0.896382101962189,0.8330152755003196
grok-4.3,0.7421428571428572,0.6928571428571428,0.7014285714285714,0.7485714285714286,917.0486598859516,1.390015663014456,0.8257142857142857,,1.0,700,700,0.8196624837557729,0.6320521368671593,0.93,0.8875855328533853,0.8282265887480711
claude-haiku-4.5,0.722857142857143,0.7014285714285714,0.7014285714285714,0.7342857142857143,1075.7582983389796,55.305595814215714,0.7542857142857143,,1.0,700,700,0.7325832852331737,0.4535978733326653,0.9285714285714286,0.84771671666668,0.7258787668181789
gpt-5.4-mini,0.7278571428571429,0.7071428571428572,0.7071428571428572,0.7300000000000001,1001.7978197051054,0.6273665695018973,0.7671428571428571,,1.0,700,700,0.7065888130762589,0.3704812593043977,0.9014285714285715,0.8411461177829542,0.693424814057304
grok-4.1-fast,0.7364285714285714,0.7071428571428572,0.7142857142857143,0.7528571428571428,1094.7510867597384,61.05852970735781,0.7714285714285715,,1.0,700,700,0.6872079068977911,0.34912417975348237,0.9,0.8399590878092994,0.6714187528504807
gpt-5.4-nano,0.6921428571428571,0.6857142857142857,0.6857142857142857,0.6957142857142857,1448.7483450823104,37.29732095822304,0.7014285714285714,,1.0,700,700,0.6597045459086216,0.345723174953565,0.9,0.7784150343086543,0.6353918202596233
gemini-3.1-pro-preview,0.7903571428571429,0.7042857142857143,0.7185714285714286,0.8385714285714286,342.1902230660652,1.6445076728317636,0.9,,1.0,700,700,0.9111517774705656,0.8013523761395511,0.9771428571428571,0.9447985279307974,0.8940839389579354
gpt-5.5,0.8064285714285714,0.7071428571428572,0.7571428571428571,0.8471428571428571,245.5950380074714,1.5414292346573133,0.9142857142857144,,1.0,700,700,0.9093743375954415,0.8123781934228569,0.9871428571428571,0.954344792415381,0.8896668687255104
grok-4.20,0.7814285714285714,0.7028571428571428,0.7128571428571429,0.8200000000000001,355.73093788191335,1.654425144552594,0.89,,1.0,700,700,0.8998278275696227,0.7847639647503688,0.9771428571428571,0.9391958032416197,0.8808862533184595
claude-sonnet-4.6,0.7814285714285714,0.6971428571428572,0.7128571428571429,0.8285714285714285,340.30219302281733,1.5760235112441965,0.8871428571428571,,1.0,700,700,0.8960040458248818,0.795188820935437,0.9728571428571429,0.9392503058676615,0.8825301666744982
claude-opus-4.7,0.7807142857142857,0.6985714285714286,0.7114285714285715,0.8257142857142857,419.4251000636586,1.6175101777067462,0.8871428571428571,,1.0,700,700,0.8889683344552282,0.7824543613753719,0.9685714285714285,0.9368854149249966,0.8833840171817674
gemini-3-flash-preview,0.7675000000000001,0.7042857142857143,0.7142857142857143,0.7985714285714286,460.36469078893066,14.15793020330454,0.8528571428571429,,1.0,700,700,0.8727060087182099,0.7133828171805401,0.9771428571428571,0.9268658948980719,0.8530325349230929
gemini-3.1-flash-lite-preview,0.7392857142857144,0.6942857142857143,0.7000000000000001,0.7571428571428571,570.36474322509,0.4288882820234676,0.8057142857142857,,1.0,700,700,0.8369532317005409,0.6717560986618268,0.9485714285714286,0.896382101962189,0.8193364174998848
grok-4.3,0.7421428571428572,0.6928571428571428,0.7014285714285714,0.7485714285714286,917.0486598859516,1.390015663014456,0.8257142857142857,,1.0,700,700,0.8155889848213503,0.6219832713948583,0.93,0.8875855328533853,0.8172201569949634
claude-haiku-4.5,0.722857142857143,0.7014285714285714,0.7014285714285714,0.7342857142857143,1075.7582983389796,55.305595814215714,0.7542857142857143,,1.0,700,700,0.7374240447218764,0.44550015751012395,0.9285714285714286,0.84771671666668,0.7008386952499461
gpt-5.4-mini,0.7278571428571429,0.7071428571428572,0.7071428571428572,0.7300000000000001,1001.7978197051054,0.6273665695018973,0.7671428571428571,,1.0,700,700,0.711249890974841,0.3625943841763507,0.9014285714285715,0.8411461177829542,0.6631249243497433
grok-4.1-fast,0.7364285714285714,0.7071428571428572,0.7142857142857143,0.7528571428571428,1094.7510867597384,61.05852970735781,0.7714285714285715,,1.0,700,700,0.6965521537810396,0.34254537308012356,0.9,0.8399590878092994,0.6386818367308976
gpt-5.4-nano,0.6921428571428571,0.6857142857142857,0.6857142857142857,0.6957142857142857,1448.7483450823104,37.29732095822304,0.7014285714285714,,1.0,700,700,0.6640855643583242,0.3379562775663838,0.9,0.7784150343086543,0.6074794546270517

Large diffs are not rendered by default.

Loading