diff --git a/app/package.json b/app/package.json index 93015e3..8b7fe1a 100644 --- a/app/package.json +++ b/app/package.json @@ -8,7 +8,8 @@ "dev": "next dev --turbopack", "build": "next build", "start": "next start", - "lint": "eslint . --max-warnings=0" + "lint": "eslint . --max-warnings=0", + "test": "bun test tests" }, "devDependencies": { "@tailwindcss/postcss": "^4.2.4", diff --git a/app/public/metric-options.html b/app/public/metric-options.html index a44b0ee..6ffbd43 100644 --- a/app/public/metric-options.html +++ b/app/public/metric-options.html @@ -525,7 +525,7 @@

How should we score a model across many outputs?

1The setup

- Three households, five outputs (four dollar amounts plus one + Three households, four outputs (three dollar amounts plus one eligibility flag), two models. Every step on this page uses the same numbers.

@@ -656,9 +656,9 @@

3Equal weights

- Average the five row scores into a household score, then + Average the four row scores into a household score, then average the household scores into a model score. Each output - gets a weight of 1/5 in every household. + gets a weight of 1/4 in every household.

@@ -695,7 +695,7 @@

4Per-household impact weights

Now an output gets weight in proportion to its dollar size for that household. A $5,000 income tax line carries more - weight than a $500 ACA premium tax credit. + weight than a $500 benefit line.

@@ -877,9 +877,8 @@

Worked examples on realistic households

The "share of net income that flowed through this program" reading is intuitive. Medicaid eligibility carries a meaningful weight because it shows up with sizeable paired - values in two of the four households; ACA PTC carries - smaller weight because only one household uses it. None - of these numbers required a tunable parameter. + values in two of the four households. None of these numbers + required a tunable parameter.

@@ -1309,7 +1308,6 @@

12Side by side

{ id: "income_tax", label: "Income tax", kind: "amount" }, { id: "payroll_tax", label: "Payroll tax", kind: "amount" }, { id: "snap", label: "SNAP", kind: "amount" }, - { id: "aca_ptc", label: "ACA PTC", kind: "amount" }, { id: "medicaid_eligible", label: "Medicaid eligibility", @@ -1329,7 +1327,6 @@

12Side by side

income_tax: -4000, payroll_tax: -5000, snap: 0, - aca_ptc: 0, medicaid_eligible: false, medicaid_value: 0, }, @@ -1342,7 +1339,6 @@

12Side by side

income_tax: -1000, payroll_tax: -1500, snap: 6000, - aca_ptc: 8000, medicaid_eligible: true, medicaid_value: 6000, }, @@ -1355,7 +1351,6 @@

12Side by side

income_tax: -2000, payroll_tax: 0, snap: 2400, - aca_ptc: 0, medicaid_eligible: true, medicaid_value: 9000, }, @@ -1374,7 +1369,6 @@

12Side by side

income_tax: 3000, payroll_tax: -1530, snap: 5000, - aca_ptc: 3000, medicaid_eligible: true, medicaid_value: 8000, }, @@ -1387,7 +1381,6 @@

12Side by side

income_tax: -4500, payroll_tax: -4590, snap: 0, - aca_ptc: 0, medicaid_eligible: false, medicaid_value: 0, }, @@ -1400,7 +1393,6 @@

12Side by side

income_tax: 0, payroll_tax: 0, snap: 2400, - aca_ptc: 0, medicaid_eligible: true, medicaid_value: 9000, }, @@ -1413,7 +1405,6 @@

12Side by side

income_tax: -40000, payroll_tax: -9900, snap: 0, - aca_ptc: 0, medicaid_eligible: false, medicaid_value: 0, }, @@ -1425,18 +1416,18 @@

12Side by side

id: "tax", label: "Tax-focused", predictions: { - H1: { income_tax: -4000, payroll_tax: -5000, snap: 0, aca_ptc: 0, medicaid_eligible: false }, - H2: { income_tax: -1000, payroll_tax: -1500, snap: 0, aca_ptc: 0, medicaid_eligible: false }, - H3: { income_tax: -2000, payroll_tax: 0, snap: 0, aca_ptc: 0, medicaid_eligible: false }, + H1: { income_tax: -4000, payroll_tax: -5000, snap: 0, medicaid_eligible: false }, + H2: { income_tax: -1000, payroll_tax: -1500, snap: 0, medicaid_eligible: false }, + H3: { income_tax: -2000, payroll_tax: 0, snap: 0, medicaid_eligible: false }, }, }, { id: "benefit", label: "Benefit-focused", predictions: { - H1: { income_tax: -2000, payroll_tax: -4000, snap: 0, aca_ptc: 0, medicaid_eligible: false }, - H2: { income_tax: 0, payroll_tax: -1000, snap: 6000, aca_ptc: 8000, medicaid_eligible: true }, - H3: { income_tax: -1000, payroll_tax: 0, snap: 2400, aca_ptc: 0, medicaid_eligible: true }, + H1: { income_tax: -2000, payroll_tax: -4000, snap: 0, medicaid_eligible: false }, + H2: { income_tax: 0, payroll_tax: -1000, snap: 6000, medicaid_eligible: true }, + H3: { income_tax: -1000, payroll_tax: 0, snap: 2400, medicaid_eligible: true }, }, }, ]; diff --git a/app/public/paper/web/index.html b/app/public/paper/web/index.html index f9d3528..a3f0adb 100644 --- a/app/public/paper/web/index.html +++ b/app/public/paper/web/index.html @@ -362,7 +362,7 @@

Frozen snapshot an 30 Output groups -19 US and 7 UK +18 US and 7 UK 31 @@ -539,7 +539,7 @@

Frozen snapshot an

Data and scenario construction

United States

The US benchmark is built from Enhanced Current Population Survey (CPS)-derived households using PolicyEngine US. The sampled households are filtered to keep a single-tax-unit, single-family, single-Supplemental Poverty Measure (SPM)-unit structure with at least one adult and a supported filing status. The 2024 Enhanced CPS source contains 41,314 households; 30,173 (73.0%) pass the filter and form the eligible draw. The 27.0% excluded by the filter include multi-tax-unit households (e.g., adult roommates), multi-family households, multi-SPM-unit households, and households whose head reports a filing status outside the supported set. These excluded compositions are exactly the kind of cases where federal/state credit allocations and benefit-unit rules become hardest, so the eligible draw is a tractable subset rather than the full distribution of US households. Prompts include nonzero promptable raw inputs across relevant entities rather than a hand-curated summary, so the models see many of the same facts the simulator receives. Filing status is not stated in the prompt; the reference computation infers it from tax-unit role flags. Models therefore see the same household facts that drive the reference filing-status assignment, but they do not receive that assignment as a label.

-

The current US release evaluates 19 output groups spanning federal income tax, refundable credits, payroll and self-employment tax, state and local income tax, Supplemental Nutrition Assistance Program (SNAP), Supplemental Security Income (SSI), Temporary Assistance for Needy Families (TANF), Affordable Care Act (ACA) premium tax credits, school-meal eligibility, and person-level coverage eligibility for the Special Supplemental Nutrition Program for Women, Infants, and Children (WIC), Medicaid, the Children’s Health Insurance Program (CHIP), Medicare, Head Start, and Early Head Start.

+

The current US release evaluates 18 output groups spanning federal income tax, refundable credits, payroll and self-employment tax, state and local income tax, Supplemental Nutrition Assistance Program (SNAP), Supplemental Security Income (SSI), Temporary Assistance for Needy Families (TANF), school-meal eligibility, and person-level coverage eligibility for the Special Supplemental Nutrition Program for Women, Infants, and Children (WIC), Medicaid, the Children’s Health Insurance Program (CHIP), Medicare, Head Start, and Early Head Start.

The output scope is intentionally narrower than the full PolicyEngine model. Table 3 summarizes the inclusion rule. The benchmark asks for WIC eligibility rather than a WIC dollar amount; WIC dollar values are used only as impact-weight proxies for coverage flags, not as requested model outputs.

@@ -583,18 +583,13 @@

United States

Excluded Intermediate tax bases, payroll subcomponents, and outputs that mainly require unavailable history, restricted local market data, restricted program-administration data, or take-up assignment rather than rule calculation. - -2 -ACA Premium Tax Credit -Retained as a deliberate health-support output; when local benchmark premiums are not listed, the model must estimate them from the household facts. - -3 +2 Binary coverage outputs Requested as 0/1 eligibility flags and scored as classification tasks; their dollar values are used only as impact-weight proxies, not as requested model outputs. -4 +3 WIC The benchmark asks for person-level WIC eligibility. It does not ask models to estimate a WIC dollar amount. @@ -1999,7 +1994,7 @@

Appendix A: Structu 4 Final parse coverage -The repaired manuscript snapshot has zero missing parsed numeric values and zero missing explanations across all 34,656 model-output rows. +The repaired manuscript snapshot has zero missing parsed numeric values and zero missing explanations across all 33,456 model-output rows. 5 @@ -2690,4 +2685,4 @@

Competing interests

- \ No newline at end of file + diff --git a/app/src/App.tsx b/app/src/App.tsx index 53f2255..78e5a09 100644 --- a/app/src/App.tsx +++ b/app/src/App.tsx @@ -1,6 +1,6 @@ "use client"; -import { useEffect, useMemo, useRef, useState } from "react"; +import { useCallback, useEffect, useMemo, useRef, useState } from "react"; import rawData from "./data.json"; import Hero from "./components/Hero"; import FailureModes from "./components/FailureModes"; @@ -8,42 +8,42 @@ import Methodology from "./components/Methodology"; import ModelLeaderboard from "./components/ModelLeaderboard"; import ProgramHeatmap from "./components/ProgramHeatmap"; import ScenarioExplorer from "./components/ScenarioExplorer"; -import type { BenchData, CountryCode, DashboardBundle, ViewKey } from "./types"; +import { filterExcludedOutputs } from "./lib/dashboardFilter"; +import { + buildProgramOptions, + resolveActiveProgramIds, + selectOnlyProgram as selectOnlyProgramFilter, + toggleProgramSelection, +} from "./lib/programFilters"; +import type { CountryCode, DashboardBundle, ViewKey } from "./types"; import { VIEW_LABELS } from "./types"; -const dashboard = rawData as DashboardBundle; +const dashboard = filterExcludedOutputs(rawData as DashboardBundle); export type { DashboardBundle } from "./types"; const COUNTRY_NAV_ITEMS = [ { id: "models", label: "Models" }, + { id: "programs", label: "Programs" }, { id: "scenarios", label: "Scenarios" }, { id: "failure-modes", label: "Failure" }, - { id: "programs", label: "Programs" }, - { id: "methodology", label: "Method" }, -] as const; - -const GLOBAL_NAV_ITEMS = [ - { id: "models", label: "Models" }, { id: "methodology", label: "Method" }, ] as const; const COUNTRY_ORDER: CountryCode[] = ["us", "uk"]; +const COUNTRY_ROUTE_HREFS: Record = { + us: "/us", + uk: "/uk", +}; -function getAvailableViews(dashboard: DashboardBundle): ViewKey[] { - const countryViews = COUNTRY_ORDER.filter((country) => dashboard.countries[country]); - if (dashboard.global?.modelStats.length && countryViews.length > 1) { - return ["global", ...countryViews]; - } - return countryViews; +function getAvailableViews(dashboard: DashboardBundle): CountryCode[] { + return COUNTRY_ORDER.filter((country) => dashboard.countries[country]); } -/** Map IANA timezone or BCP-47 language to a benchmark country, when we can tell. */ -function detectVisitorCountry( - availableViews: readonly ViewKey[], -): CountryCode | null { +/** Default UK visitors to the UK benchmark; everyone else starts on the US benchmark. */ +function detectVisitorCountry(availableViews: readonly CountryCode[]): CountryCode { if (typeof window === "undefined" || typeof navigator === "undefined") { - return null; + return availableViews.includes("us") ? "us" : (availableViews[0] ?? "us"); } let timezone = ""; try { @@ -53,11 +53,6 @@ function detectVisitorCountry( } const langs = (navigator.languages ?? [navigator.language ?? ""]) .map((value) => value.toLowerCase()); - const matchesUS = - timezone.startsWith("America/") || - timezone === "Pacific/Honolulu" || - timezone === "Pacific/Pago_Pago" || - langs.some((lang) => lang === "en-us" || lang.endsWith("-us")); const matchesUK = timezone === "Europe/London" || timezone === "Europe/Belfast" || @@ -67,42 +62,86 @@ function detectVisitorCountry( langs.some((lang) => ["en-gb", "cy-gb", "gd-gb", "en-uk"].includes(lang), ); - if (matchesUS && availableViews.includes("us")) return "us"; if (matchesUK && availableViews.includes("uk")) return "uk"; - return null; + return availableViews.includes("us") ? "us" : (availableViews[0] ?? "us"); } -export default function App() { +export default function App({ initialView }: { initialView?: CountryCode } = {}) { const availableViews = useMemo(() => getAvailableViews(dashboard), []); - // Default to the global leaderboard. After mount we try to switch to the - // visitor's country (US or UK) when we can detect it from timezone or - // navigator.language; if neither matches we stay on Global. - const initialView: ViewKey = availableViews.includes("global") - ? "global" - : (availableViews[0] ?? "global"); - const [selectedView, setSelectedView] = useState(initialView); + // Default to the US benchmark, then switch UK visitors after mount when + // timezone or browser language gives us a clear signal. + const defaultView: CountryCode = availableViews.includes("us") + ? "us" + : (availableViews[0] ?? "us"); + const routeView = + initialView && availableViews.includes(initialView) ? initialView : null; + const [localView, setLocalView] = useState( + routeView ?? defaultView, + ); + const selectedView = routeView ?? localView; const [hasUserPickedView, setHasUserPickedView] = useState(false); + const [selectedPrograms, setSelectedPrograms] = useState>( + () => new Set(), + ); + const [activeNav, setActiveNav] = useState("models"); + const observerRef = useRef(null); useEffect(() => { + if (routeView) return; if (hasUserPickedView) return; const detected = detectVisitorCountry(availableViews); - if (detected && detected !== selectedView) { - setSelectedView(detected); + if (detected !== localView) { + setLocalView(detected); } // We only want this auto-pick to run once per session; further changes // come from the user clicking the country selector. // eslint-disable-next-line react-hooks/exhaustive-deps }, []); - const [activeNav, setActiveNav] = useState("models"); - const observerRef = useRef(null); - const isGlobal = selectedView === "global"; - const data = isGlobal - ? dashboard.global! - : dashboard.countries[selectedView as CountryCode]!; - const navItems = isGlobal ? GLOBAL_NAV_ITEMS : COUNTRY_NAV_ITEMS; + const data = dashboard.countries[selectedView]!; + const navItems = COUNTRY_NAV_ITEMS; + const viewHrefs = useMemo(() => { + return Object.fromEntries( + availableViews.map((view) => [view, COUNTRY_ROUTE_HREFS[view]]), + ) as Partial>; + }, [availableViews]); + + const programOptions = useMemo(() => buildProgramOptions(data), [data]); + + const programOptionIds = useMemo( + () => programOptions.map((option) => option.variable), + [programOptions], + ); + + const activeProgramIds = useMemo(() => { + return resolveActiveProgramIds(programOptionIds, selectedPrograms); + }, [programOptionIds, selectedPrograms]); + + const activeProgramSummary = + activeProgramIds.size < programOptions.length + ? `${activeProgramIds.size} of ${programOptions.length} selected` + : `All ${programOptions.length} programs`; + + const resetPrograms = useCallback(() => { + setSelectedPrograms(new Set()); + }, []); + + const toggleProgram = useCallback( + (variable: string) => { + setSelectedPrograms((previous) => { + return toggleProgramSelection(programOptionIds, previous, variable); + }); + }, + [programOptionIds], + ); + + const selectOnlyProgram = useCallback((variable: string) => { + setSelectedPrograms(selectOnlyProgramFilter(variable)); + }, []); + const handleSelectView = (view: ViewKey) => { - setSelectedView(view); + if (view === "global") return; + setLocalView(view); setHasUserPickedView(true); setActiveNav("models"); }; @@ -139,19 +178,10 @@ export default function App() { ); const footerCopy = useMemo(() => { - if (isGlobal) { - const totalHouseholds = Object.values(dashboard.countries).reduce( - (sum, country) => sum + Object.keys(country?.scenarios ?? {}).length, - 0 - ); - const countryCount = Object.keys(dashboard.countries).length; - return `PolicyBench — global leaderboard across ${dashboard.global?.sharedModelCount ?? 0} shared frontier models, ${countryCount} country benchmarks, and ${totalHouseholds.toLocaleString()} households.`; - } - - const countryData = data as BenchData; + const countryData = data; const scoredRows = noToolsModels.reduce((sum, model) => sum + model.n, 0); return `PolicyBench — ${VIEW_LABELS[countryData.country]} benchmark with ${scoredRows.toLocaleString()} scored outputs across ${noToolsModels.length} frontier models, ${countryData.programStats.length} programs, and ${Object.keys(countryData.scenarios).length} household scenarios.`; - }, [data, isGlobal, noToolsModels]); + }, [data, noToolsModels]); return (
@@ -168,6 +198,7 @@ export default function App() { dashboard={dashboard} data={data} availableViews={availableViews} + viewHrefs={viewHrefs} navItems={navItems} activeNav={activeNav} /> @@ -183,33 +214,49 @@ export default function App() { data={data} selectedView={selectedView} dashboard={dashboard} + programOptions={programOptions} + activeProgramIds={activeProgramIds} + activeProgramSummary={activeProgramSummary} + onResetPrograms={resetPrograms} + onToggleProgram={toggleProgram} + onSelectOnlyProgram={selectOnlyProgram} + /> + + +
+
+
- {!isGlobal && ( - <> -
-
- -
- -
-
- -
- -
-
- -
- - )} +
+
+ +
+ +
+
+ +
diff --git a/app/src/app/layout.tsx b/app/src/app/layout.tsx index 1788381..04cba9d 100644 --- a/app/src/app/layout.tsx +++ b/app/src/app/layout.tsx @@ -8,6 +8,14 @@ export const metadata: Metadata = { }, description: "Benchmarking no-tools policy calculation across frontier models.", + icons: { + icon: [ + { + url: "/assets/policyengine-mark.svg", + type: "image/svg+xml", + }, + ], + }, }; export default function RootLayout({ diff --git a/app/src/app/uk/page.tsx b/app/src/app/uk/page.tsx new file mode 100644 index 0000000..c0ce632 --- /dev/null +++ b/app/src/app/uk/page.tsx @@ -0,0 +1,10 @@ +import type { Metadata } from "next"; +import App from "../../App"; + +export const metadata: Metadata = { + title: "United Kingdom", +}; + +export default function UnitedKingdomPage() { + return ; +} diff --git a/app/src/app/us/page.tsx b/app/src/app/us/page.tsx new file mode 100644 index 0000000..7f3751a --- /dev/null +++ b/app/src/app/us/page.tsx @@ -0,0 +1,10 @@ +import type { Metadata } from "next"; +import App from "../../App"; + +export const metadata: Metadata = { + title: "United States", +}; + +export default function UnitedStatesPage() { + return ; +} diff --git a/app/src/components/FailureModes.tsx b/app/src/components/FailureModes.tsx index d15c64e..0f17c36 100644 --- a/app/src/components/FailureModes.tsx +++ b/app/src/components/FailureModes.tsx @@ -1,16 +1,22 @@ +import { useMemo } from "react"; import type { BenchData, FailureModesPayload, - HouseholdFailure, + HeatmapEntry, ProgramFailure, } from "../types"; import { getVariableLabel } from "../types"; +import { getVariableExplainer } from "../variableExplainers"; function formatPct(value?: number | null) { if (value == null || Number.isNaN(value)) return "n/a"; return `${value.toFixed(1)}%`; } +function getHeatmapScore(entry: HeatmapEntry): number { + return entry.score ?? entry.within10pct ?? entry.accuracy ?? 0; +} + function StatLine({ label, value, @@ -74,15 +80,91 @@ function ProgramCard({ ); } -function HouseholdChip({ household }: { household: HouseholdFailure }) { +function ErrorReadPatterns({ + data, + variables, +}: { + data: BenchData; + variables: string[]; +}) { + const averageScores = useMemo(() => { + const valuesByVariable: Record = {}; + for (const entry of data.heatmap) { + if (entry.condition !== "no_tools" || !variables.includes(entry.variable)) { + continue; + } + if (!valuesByVariable[entry.variable]) valuesByVariable[entry.variable] = []; + valuesByVariable[entry.variable].push(getHeatmapScore(entry)); + } + return Object.fromEntries( + Object.entries(valuesByVariable).map(([variable, values]) => [ + variable, + values.reduce((sum, value) => sum + value, 0) / values.length, + ]), + ); + }, [data.heatmap, variables]); + return ( -
-
{household.label}
-
- {household.n.toLocaleString()} scored rows - - {formatPct(household.correctPct)} - +
+
+ What the error reads show +
+

+ These expanders summarize recurring miss patterns from direct reads of + model answers and explanations. They sit here with failure modes because + they describe why the low-scoring program slices break. +

+ +
+ {variables.map((variable) => { + const explainer = getVariableExplainer(data.country, variable); + const avg = averageScores[variable]; + return ( +
+ +
+ + ▸ + +
+
+ {getVariableLabel(variable, data.country)} +
+

+ {explainer?.summary ?? + "This target combines multiple policy rules, and errors usually come from positive cases rather than zero cases."} +

+
+
+ {avg !== undefined && ( +
+ Avg {avg.toFixed(0)}% +
+ )} +
+ +
+
+ Common misses +
+
    + {(explainer?.bullets ?? []).map((bullet) => ( +
  • + + {bullet} +
  • + ))} +
+
+
+ ); + })}
); @@ -92,7 +174,7 @@ export default function FailureModes({ data }: { data: BenchData }) { const country = data.country; const failureModes: FailureModesPayload = data.failureModes; const hardestPrograms = [...failureModes.programs].slice(0, 10); - const hardestHouseholds = [...failureModes.households].slice(0, 7); + const errorReadVariables = hardestPrograms.map((program) => program.variable); return (
@@ -118,7 +200,7 @@ export default function FailureModes({ data }: { data: BenchData }) { style={{ animationDelay: "240ms" }} >
- Read this carefully + How to read these cards

These cards are intentionally stricter than the aggregate leaderboard but @@ -143,19 +225,7 @@ export default function FailureModes({ data }: { data: BenchData }) { ))}

-
-
- Hardest household segments -
-
- {hardestHouseholds.map((household) => ( - - ))} -
-
+
); } diff --git a/app/src/components/Hero.tsx b/app/src/components/Hero.tsx index 32577fa..21fbe9c 100644 --- a/app/src/components/Hero.tsx +++ b/app/src/components/Hero.tsx @@ -16,6 +16,7 @@ export default function Hero({ availableViews, navItems, activeNav, + viewHrefs, }: { selectedView: ViewKey; onSelectView: (view: ViewKey) => void; @@ -24,6 +25,7 @@ export default function Hero({ availableViews: ViewKey[]; navItems: readonly HeaderNavItem[]; activeNav: string; + viewHrefs?: Partial>; }) { const isGlobal = selectedView === "global"; const benchData = isGlobal ? null : (data as BenchData); @@ -72,6 +74,7 @@ export default function Hero({ selectedView={selectedView} onSelectView={onSelectView} availableViews={availableViews} + viewHrefs={viewHrefs} actionLink={{ label: "Paper", href: "/paper", type: "internal" }} /> @@ -81,7 +84,7 @@ export default function Hero({ >
diff --git a/app/src/components/ModelLeaderboard.tsx b/app/src/components/ModelLeaderboard.tsx index 487d340..b1005a1 100644 --- a/app/src/components/ModelLeaderboard.tsx +++ b/app/src/components/ModelLeaderboard.tsx @@ -1,4 +1,4 @@ -import { useMemo, useState } from "react"; +import { useCallback, useMemo, useState } from "react"; import type { BenchData, DashboardBundle, @@ -9,6 +9,12 @@ import type { import { VIEW_SHORT_LABELS, getVariableLabel } from "../types"; import { MODEL_LABELS, MODEL_ORDER, getProviderForModel } from "../modelMeta"; import ProviderMark from "./ProviderMark"; +import ProgramFilterDropdown from "./ProgramFilterDropdown"; +import { + programIsActive, + type ProgramOption, + weightedProgramScore, +} from "../lib/programFilters"; import { SENSITIVITY_VIEWS, modelScoresForView, @@ -104,10 +110,22 @@ export default function ModelLeaderboard({ data, selectedView, dashboard, + programOptions, + activeProgramIds, + activeProgramSummary, + onResetPrograms, + onToggleProgram, + onSelectOnlyProgram, }: { data: BenchData | GlobalBenchData; selectedView: ViewKey; dashboard: DashboardBundle; + programOptions: ProgramOption[]; + activeProgramIds: Set; + activeProgramSummary: string; + onResetPrograms: () => void; + onToggleProgram: (variable: string) => void; + onSelectOnlyProgram: (variable: string) => void; }) { const isGlobal = selectedView === "global"; const [sensitivityView, setSensitivityView] = @@ -127,7 +145,6 @@ export default function ModelLeaderboard({ const [referenceFilter, setReferenceFilter] = useState< "all" | "positives" | "zeros" >("all"); - // Defensive: if a model's payload doesn't include the requested view (stale // data.json), fall back to the canonical Household view so the leaderboard // still has a defensible ranking. @@ -152,6 +169,12 @@ export default function ModelLeaderboard({ return out; }, [sensitivityScores]); + const isProgramActive = useCallback( + (variable: string) => + isGlobal || programIsActive(activeProgramIds, variable), + [activeProgramIds, isGlobal], + ); + // When the user filters to "positives" or "zeros", the heatmap's pre- // aggregated `exact`/`within1pct`/`score` columns aren't usable because // they don't distinguish reference type. Recompute per-(model, variable) @@ -173,6 +196,7 @@ export default function ModelLeaderboard({ >(); for (const varMap of Object.values(country.scenarioPredictions ?? {})) { for (const [variable, modelMap] of Object.entries(varMap)) { + if (!isProgramActive(variable)) continue; for (const [model, pred] of Object.entries(modelMap)) { if (pred.prediction === null) continue; const truth = pred.groundTruth; @@ -219,7 +243,7 @@ export default function ModelLeaderboard({ out.set(model, rates); } return out; - }, [data, isGlobal, referenceFilter]); + }, [data, isGlobal, referenceFilter, isProgramActive]); // Compute weighted hit-rate for one model under the current weighting. We // use this for both "exact" (uses heatmap.exact) and "within 1%" (uses @@ -239,39 +263,31 @@ export default function ModelLeaderboard({ if (referenceFilter !== "all") { // Recomputed per-(model, variable) hit rates: weight by globalWeights. for (const [model, byVar] of filteredHitRates) { - let num = 0; - let den = 0; - for (const [variable, rates] of byVar) { - const w = weights[variable]; - if (w === undefined) continue; - num += w * (rates[field] / 100); - den += w; - } - if (den > 0) out.set(model, (num / den) * 100); + const score = weightedProgramScore( + [...byVar].map(([variable, rates]) => ({ + variable, + value: rates[field], + })), + weights, + ); + if (score !== undefined) out.set(model, score); } return out; } - if (field === "continuous") { - // For "all" + continuous, the precomputed sensitivity score is - // canonical; the caller uses it directly via sensitivityScoreByModel. - return out; - } - - const totals = new Map(); + const ratesByModel = new Map(); for (const entry of country.heatmap ?? []) { if (entry.condition !== "no_tools") continue; - const value = entry[field]; + if (!isProgramActive(entry.variable)) continue; + const value = field === "continuous" ? entry.score : entry[field]; if (value === undefined) continue; - const w = weights[entry.variable]; - if (w === undefined) continue; - const acc = totals.get(entry.model) ?? { num: 0, den: 0 }; - acc.num += w * (value / 100); - acc.den += w; - totals.set(entry.model, acc); + const rates = ratesByModel.get(entry.model) ?? []; + rates.push({ variable: entry.variable, value }); + ratesByModel.set(entry.model, rates); } - for (const [model, { num, den }] of totals) { - if (den > 0) out.set(model, (num / den) * 100); + for (const [model, rates] of ratesByModel) { + const score = weightedProgramScore(rates, weights); + if (score !== undefined) out.set(model, score); } return out; }; @@ -279,19 +295,63 @@ export default function ModelLeaderboard({ const exactScoreByModel = useMemo( () => hitRateByModel("exact"), // eslint-disable-next-line react-hooks/exhaustive-deps - [data, effectiveView, isGlobal, referenceFilter, filteredHitRates], + [ + data, + effectiveView, + isGlobal, + referenceFilter, + filteredHitRates, + activeProgramIds, + ], ); const within1pctScoreByModel = useMemo( () => hitRateByModel("within1pct"), // eslint-disable-next-line react-hooks/exhaustive-deps - [data, effectiveView, isGlobal, referenceFilter, filteredHitRates], + [ + data, + effectiveView, + isGlobal, + referenceFilter, + filteredHitRates, + activeProgramIds, + ], ); const filteredContinuousByModel = useMemo( () => hitRateByModel("continuous"), // eslint-disable-next-line react-hooks/exhaustive-deps - [data, effectiveView, isGlobal, referenceFilter, filteredHitRates], + [ + data, + effectiveView, + isGlobal, + referenceFilter, + filteredHitRates, + activeProgramIds, + ], ); + const selectedMaeByModel = useMemo(() => { + const out = new Map(); + if (isGlobal || !("scenarioPredictions" in data)) return out; + const totals = new Map(); + for (const variableMap of Object.values(data.scenarioPredictions)) { + for (const [variable, modelMap] of Object.entries(variableMap)) { + if (!isProgramActive(variable)) continue; + for (const [model, row] of Object.entries(modelMap)) { + if (row.prediction === null || row.prediction === undefined) continue; + const acc = totals.get(model) ?? { sum: 0, n: 0 }; + acc.sum += Math.abs(row.prediction - row.groundTruth); + acc.n += 1; + totals.set(model, acc); + } + } + } + for (const [model, { sum, n }] of totals) { + if (n > 0) out.set(model, sum / n); + } + return out; + // eslint-disable-next-line react-hooks/exhaustive-deps + }, [data, isGlobal, activeProgramIds]); + const noTools = useMemo(() => { const base = data.modelStats.filter((m) => m.condition === "no_tools"); @@ -309,19 +369,24 @@ export default function ModelLeaderboard({ const w = weighted.get(m.model); const fallback = scoringMode === "exact" ? (m.exact ?? 0) : (m.within1pct ?? 0); - return { ...m, score: w !== undefined ? w : fallback }; + return { + ...m, + score: w !== undefined ? w : fallback, + mae: selectedMaeByModel.get(m.model) ?? m.mae, + }; }) .sort((a, b) => b.score - a.score); } - // Continuous mode. When filtering to positives/zeros we use the - // recomputed weighted score; otherwise fall back to the precomputed - // bounded score (Household weighting) or the sensitivity view. - if (referenceFilter !== "all") { + // Continuous mode. Use the recomputed weighted score for country views so + // program filters renormalize the active program weights; otherwise fall + // back to the precomputed bounded score or global sensitivity view. + if (!isGlobal && filteredContinuousByModel.size > 0) { return [...base] .map((m) => ({ ...m, score: filteredContinuousByModel.get(m.model) ?? 0, + mae: selectedMaeByModel.get(m.model) ?? m.mae, })) .sort((a, b) => b.score - a.score); } @@ -339,10 +404,11 @@ export default function ModelLeaderboard({ effectiveView, sensitivityScoreByModel, filteredContinuousByModel, - referenceFilter, exactScoreByModel, within1pctScoreByModel, + selectedMaeByModel, scoringMode, + isGlobal, ]); const pendingModels = useMemo(() => { @@ -371,7 +437,11 @@ export default function ModelLeaderboard({ const all = new Set(); for (const view of ["household", "aggregate", "equal"] as const) { const map = weights[view]; - if (map) Object.keys(map).forEach((v) => all.add(v)); + if (map) { + Object.keys(map) + .filter(isProgramActive) + .forEach((v) => all.add(v)); + } } const ranked = Array.from(all).map((v) => ({ v, @@ -379,7 +449,7 @@ export default function ModelLeaderboard({ })); ranked.sort((a, b) => b.key - a.key); return ranked.map((r) => r.v); - }, [weights, effectiveView]); + }, [weights, effectiveView, isProgramActive]); return (
@@ -396,8 +466,8 @@ export default function ModelLeaderboard({ style={{ animationDelay: "160ms" }} > {isGlobal - ? "Global scores are equal-weight averages of each model’s US and UK bounded scores. They are not weighted by country population or household count." - : "Country scores give each household equal weight. Each variable's weight is the mean across households of |ref| / max(|household_net_income|, Σ |ref|), renormalized so the global weights sum to one."} + ? "Default ranking compares each model's exact answers across the US and UK benchmark slices." + : "Default ranking compares exact answers across all benchmark households, with one household counting as one household."} {pendingModels.length > 0 && ( <> {" "} @@ -427,201 +497,17 @@ export default function ModelLeaderboard({

-
- - Scoring - -
- {( - [ - [ - "exact", - "Exact", - "Percent of predictions that match the PolicyEngine reference to the dollar (or to the boolean for eligibility flags). Real-world policy decisions need this — close-but-not-right isn't deployable.", - ], - [ - "within1pct", - "Within 1%", - "Percent of predictions within 1% of the reference. The analyst bar — rounding and small rate/parameter drift are tolerated, but material misses are not.", - ], - [ - "continuous", - "Continuous", - "Bounded continuous score: max(0, 1 - |prediction - reference| / |reference|), clipped to [0, 1], reducing to exact-match accuracy for boolean variables. Awards partial credit for close answers; useful for tracking conceptual progress while exact rates remain low.", - ], - ] as const - ).map(([id, label, description]) => { - const isActive = scoringMode === id; - return ( - - ); - })} -
- - {scoringMode === "exact" - ? "Percent matching to the dollar / boolean." - : scoringMode === "within1pct" - ? "Percent within 1% of reference." - : "Bounded score: 1 − |err| / |ref|, clipped to [0, 1]."} - -
- - {!isGlobal && ( -
- - Reference cases - -
- {( - [ - [ - "all", - "All", - "Every (model, scenario, variable) cell in the benchmark slice.", - ], - [ - "positives", - "Positives only", - "Restrict to cases where the PolicyEngine reference is non-zero (e.g., the household actually receives the benefit or owes the tax). Reveals competence on cases that matter, especially on zero-heavy slices like UK.", - ], - [ - "zeros", - "Zeros only", - "Restrict to cases where the PolicyEngine reference is zero (no benefit, no tax). Measures eligibility hedging — does the model correctly say zero when it should?", - ], - ] as const - ).map(([id, label, description]) => { - const isActive = referenceFilter === id; - return ( - - ); - })} -
- - {referenceFilter === "all" - ? "All reference cells." - : referenceFilter === "positives" - ? "Only cases where the reference is nonzero." - : "Only cases where the reference is zero."} - -
- )} - -
- - Weighting - -
- {SENSITIVITY_VIEWS.map((view) => { - const isActive = sensitivityView === view.id; - const supported = - view.id === "household" || - viewSupportsSelected(dashboard, view.id, selectedView); - const disabled = !supported; - const disabledTitleSuffix = " (not available on this slice)"; - return ( - - ); - })} -
- - {activeView.description} - -
- {sensitivityUnsupportedForView && ( -

- The “{ - SENSITIVITY_VIEWS.find((v) => v.id === sensitivityView)?.label ?? - sensitivityView - }” view is not available on this slice; the leaderboard falls - back to the Household view. -

- )} +
+
+ + + + + + + Adjust ranking + + + Default: exact, all cases, household weights + + + + +
+
+ + Scoring + +
+ {( + [ + [ + "exact", + "Exact", + "Percent of predictions that match the PolicyEngine reference to the dollar (or to the boolean for eligibility flags). Real-world policy decisions need this — close-but-not-right isn't deployable.", + ], + [ + "within1pct", + "Within 1%", + "Percent of predictions within 1% of the reference. The analyst bar — rounding and small rate/parameter drift are tolerated, but material misses are not.", + ], + [ + "continuous", + "Continuous", + "Bounded continuous score: max(0, 1 - |prediction - reference| / |reference|), clipped to [0, 1], reducing to exact-match accuracy for boolean variables. Awards partial credit for close answers; useful for tracking conceptual progress while exact rates remain low.", + ], + ] as const + ).map(([id, label, description]) => { + const isActive = scoringMode === id; + return ( + + ); + })} +
+ + {scoringMode === "exact" + ? "Percent matching to the dollar / boolean." + : scoringMode === "within1pct" + ? "Percent within 1% of reference." + : "Bounded score: 1 - |err| / |ref|, clipped to [0, 1]."} + +
+ + {!isGlobal && ( +
+ + Reference cases + +
+ {( + [ + [ + "all", + "All", + "Every (model, scenario, variable) cell in the benchmark slice.", + ], + [ + "positives", + "Positives only", + "Restrict to cases where the PolicyEngine reference is non-zero (e.g., the household actually receives the benefit or owes the tax). Reveals competence on cases that matter, especially on zero-heavy slices like UK.", + ], + [ + "zeros", + "Zeros only", + "Restrict to cases where the PolicyEngine reference is zero (no benefit, no tax). Measures eligibility hedging — does the model correctly say zero when it should?", + ], + ] as const + ).map(([id, label, description]) => { + const isActive = referenceFilter === id; + return ( + + ); + })} +
+ + {referenceFilter === "all" + ? "All reference cells." + : referenceFilter === "positives" + ? "Only cases where the reference is nonzero." + : "Only cases where the reference is zero."} + +
+ )} + +
+ + Weighting + +
+ {SENSITIVITY_VIEWS.map((view) => { + const isActive = sensitivityView === view.id; + const supported = + view.id === "household" || + viewSupportsSelected(dashboard, view.id, selectedView); + const disabled = !supported; + const disabledTitleSuffix = " (not available on this slice)"; + return ( + + ); + })} +
+ + {activeView.description} + +
+ {sensitivityUnsupportedForView && ( +

+ The “{ + SENSITIVITY_VIEWS.find((v) => v.id === sensitivityView)?.label ?? + sensitivityView + }” view is not available on this slice; the leaderboard + falls back to the Household view. +

+ )} +
+
+ {weights && weightedVariables.length > 0 && weightsCountry && (
diff --git a/app/src/components/ProgramFilterDropdown.tsx b/app/src/components/ProgramFilterDropdown.tsx new file mode 100644 index 0000000..ef490ea --- /dev/null +++ b/app/src/components/ProgramFilterDropdown.tsx @@ -0,0 +1,107 @@ +import type { ProgramOption } from "../lib/programFilters"; + +type ProgramFilterDropdownProps = { + options: ProgramOption[]; + activeProgramIds: Set; + summary: string; + description: string; + onReset: () => void; + onToggle: (variable: string) => void; + onSelectOnly: (variable: string) => void; + className?: string; + animationDelay?: string; +}; + +export default function ProgramFilterDropdown({ + options, + activeProgramIds, + summary, + description, + onReset, + onToggle, + onSelectOnly, + className = "", + animationDelay, +}: ProgramFilterDropdownProps) { + if (options.length === 0) return null; + + return ( +
+ + + + + + + Program filter + + {summary} + + + +
+
+

+ {description} +

+ +
+ +
+ {options.map((option) => { + const checked = activeProgramIds.has(option.variable); + return ( +
+ + +
+ ); + })} +
+
+
+ ); +} diff --git a/app/src/components/ProgramHeatmap.tsx b/app/src/components/ProgramHeatmap.tsx index 8b8a12b..1c70b90 100644 --- a/app/src/components/ProgramHeatmap.tsx +++ b/app/src/components/ProgramHeatmap.tsx @@ -6,7 +6,8 @@ import { getPerformanceSurfaceColor, getPerformanceTextColor, } from "../modelMeta"; -import { getVariableExplainer } from "../variableExplainers"; +import { programIsActive, type ProgramOption } from "../lib/programFilters"; +import ProgramFilterDropdown from "./ProgramFilterDropdown"; function getHeatmapScore(entry: HeatmapEntry): number { return entry.score ?? entry.within10pct ?? entry.accuracy ?? 0; @@ -20,7 +21,32 @@ function textColor(pct: number): string { return getPerformanceTextColor(pct); } -export default function ProgramHeatmap({ data }: { data: BenchData }) { +const SCORE_LEGEND = [ + { label: "<50%", score: 45 }, + { label: "50-59%", score: 55 }, + { label: "60-69%", score: 65 }, + { label: "70-79%", score: 75 }, + { label: "80-89%", score: 85 }, + { label: "90%+", score: 95 }, +] as const; + +export default function ProgramHeatmap({ + data, + programOptions, + activeProgramIds, + activeProgramSummary, + onResetPrograms, + onToggleProgram, + onSelectOnlyProgram, +}: { + data: BenchData; + programOptions: ProgramOption[]; + activeProgramIds: Set; + activeProgramSummary: string; + onResetPrograms: () => void; + onToggleProgram: (variable: string) => void; + onSelectOnlyProgram: (variable: string) => void; +}) { const country = data.country; const { grid, variables } = useMemo(() => { // Build lookup: model+variable → bounded score @@ -34,6 +60,7 @@ export default function ProgramHeatmap({ data }: { data: BenchData }) { const varAcc: Record = {}; for (const h of data.heatmap) { if (h.condition !== "no_tools") continue; + if (!programIsActive(activeProgramIds, h.variable)) continue; if (!varAcc[h.variable]) varAcc[h.variable] = []; varAcc[h.variable].push(getHeatmapScore(h)); } @@ -44,7 +71,7 @@ export default function ProgramHeatmap({ data }: { data: BenchData }) { }); return { grid: lookup, variables }; - }, [data]); + }, [activeProgramIds, data]); const models = MODEL_ORDER.filter((m) => data.heatmap.some((h) => h.condition === "no_tools" && h.model === m), @@ -78,8 +105,20 @@ export default function ProgramHeatmap({ data }: { data: BenchData }) { binary coverage flags use exact accuracy.

+ +
@@ -141,110 +180,25 @@ export default function ProgramHeatmap({ data }: { data: BenchData }) {
- {/* Legend */}
Cells use color as a redundant cue; the percentage shown in each cell is the actual benchmark score. -
- - <50% -
-
- - 50–70% -
-
- - 70–80% -
-
- - 90%+ -
-
- -
-
- What the error reads show -
-

- These expanders summarize recurring miss patterns from direct reads of - model answers and explanations, paired with the benchmark scores - above. They are intentionally narrower than the leaderboard summaries: - the goal is to say what the evidence supports, not more. -

- -
- {variables.map((variable) => { - const explainer = getVariableExplainer(country, variable); - const avg = averageScores[variable]; - return ( -
- -
- - ▸ - -
-
- {getVariableLabel(variable, country)} -
-

- {explainer?.summary ?? - "This target combines multiple policy rules, and errors usually come from positive cases rather than zero cases."} -

-
-
-
- Avg {avg.toFixed(0)}% -
-
- -
-
- Common misses -
-
    - {(explainer?.bullets ?? []).map((bullet) => ( -
  • - - {bullet} -
  • - ))} -
-
-
- ); - })} -
+ {SCORE_LEGEND.map(({ label, score }) => ( +
+ + {label} +
+ ))}
); diff --git a/app/src/components/ScenarioExplorer.tsx b/app/src/components/ScenarioExplorer.tsx index 5038fa8..e1eac5c 100644 --- a/app/src/components/ScenarioExplorer.tsx +++ b/app/src/components/ScenarioExplorer.tsx @@ -1,4 +1,10 @@ -import React, { useEffect, useMemo, useRef, useState } from "react"; +import React, { + useCallback, + useEffect, + useMemo, + useRef, + useState, +} from "react"; import { getVariableLabel, isBinaryVariable, @@ -16,7 +22,9 @@ import { isFrontierModel, type ProviderKey, } from "../modelMeta"; +import { programIsActive, type ProgramOption } from "../lib/programFilters"; import ProviderMark from "./ProviderMark"; +import ProgramFilterDropdown from "./ProgramFilterDropdown"; function formatBoolean(value: number): string { return value === 1 ? "Yes" : "No"; @@ -85,8 +93,20 @@ function pickRandomScenario( export default function ScenarioExplorer({ data, + programOptions, + activeProgramIds, + activeProgramSummary, + onResetPrograms, + onToggleProgram, + onSelectOnlyProgram, }: { data: BenchData; + programOptions: ProgramOption[]; + activeProgramIds: Set; + activeProgramSummary: string; + onResetPrograms: () => void; + onToggleProgram: (variable: string) => void; + onSelectOnlyProgram: (variable: string) => void; }) { const country = data.country; const [promptFormat, setPromptFormat] = useState<"tool" | "json">("tool"); @@ -112,7 +132,23 @@ export default function ScenarioExplorer({ [data, resolvedScenarioId], ); - const variables = useMemo(() => Object.keys(predictions).sort(), [predictions]); + const isProgramActive = useCallback( + (variable: string) => programIsActive(activeProgramIds, variable), + [activeProgramIds], + ); + + const variables = useMemo( + () => Object.keys(predictions).filter(isProgramActive).sort(), + [isProgramActive, predictions], + ); + + const filteredPredictions = useMemo(() => { + const out: Record> = {}; + for (const variable of variables) { + out[variable] = predictions[variable] ?? {}; + } + return out; + }, [predictions, variables]); // Frontier-only narrows to one flagship per provider; provider chips // multi-select. The scenario explorer table is wide (one column per model), @@ -124,11 +160,11 @@ export default function ScenarioExplorer({ const allModels = useMemo(() => { const unique = new Set(); - for (const varData of Object.values(predictions)) { + for (const varData of Object.values(filteredPredictions)) { for (const m of Object.keys(varData)) unique.add(m); } return MODEL_ORDER.filter((m) => unique.has(m)); - }, [predictions]); + }, [filteredPredictions]); const models = useMemo(() => { return allModels.filter((m) => { @@ -149,7 +185,9 @@ export default function ScenarioExplorer({ } | null>(null); const selectedCell = - manualSelection && manualSelection.scenarioId === resolvedScenarioId + manualSelection && + manualSelection.scenarioId === resolvedScenarioId && + variables.includes(manualSelection.cell.variable) ? manualSelection.cell : null; @@ -181,19 +219,21 @@ export default function ScenarioExplorer({ const geographyLabel = country === "uk" ? "Region" : "State"; const hasFilingStatus = !!scenario.filingStatus; const currencySymbol = country === "uk" ? "£" : "$"; - const explanationRows = Object.values(predictions).reduce( + const explanationRows = Object.values(filteredPredictions).reduce( (sum, modelMap) => sum + Object.values(modelMap).filter((entry) => !!entry.explanation).length, 0, ); - const annotationRows = Object.values(predictions).reduce( + const annotationRows = Object.values(filteredPredictions).reduce( (sum, modelMap) => sum + Object.values(modelMap).filter((entry) => !!entry.annotation).length, 0, ); - const failureSources = Object.values(predictions).reduce>( + const failureSources = Object.values(filteredPredictions).reduce< + Record + >( (counts, modelMap) => { for (const entry of Object.values(modelMap)) { if (!entry.failureSource) continue; @@ -203,13 +243,13 @@ export default function ScenarioExplorer({ }, {}, ); - const caseAnnotationRows = Object.values(predictions).reduce( + const caseAnnotationRows = Object.values(filteredPredictions).reduce( (sum, modelMap) => sum + Object.values(modelMap).filter((entry) => !!entry.caseAnnotation).length, 0, ); - const totalPredictionRows = Object.values(predictions).reduce( + const totalPredictionRows = Object.values(filteredPredictions).reduce( (sum, modelMap) => sum + Object.keys(modelMap).length, 0, ); @@ -232,9 +272,8 @@ export default function ScenarioExplorer({ className="text-text-secondary mt-3 max-w-2xl leading-relaxed animate-fade-up" style={{ animationDelay: "160ms" }} > - Inspect benchmark households and the exact prompt sent to every model. - Click any prediction cell to see the model's reasoning and our - review of where it went wrong. + Inspect benchmark households, reference outputs, model answers, and the + exact prompt sent to every model.

@@ -320,103 +359,21 @@ export default function ScenarioExplorer({ ))}
- {totalPredictionRows > 0 && ( -
-
- Explanation and audit coverage -
-

- {explanationRows} of {totalPredictionRows} model-output rows for - this household include explanation text returned by the model.{" "} - {annotationRows} rows include developer audit notes for incorrect - predictions, and {caseAnnotationRows} incorrect rows include - case-level notes comparing wrong models on the same - household-output target. Click a prediction cell to read them - below the table. -

- {Object.keys(failureSources).length > 0 && ( -
- {Object.entries(failureSources) - .sort((a, b) => b[1] - a[1]) - .map(([source, count]) => ( - - {formatFailureLabel(source)}: {count} - - ))} -
- )} -
- )} - - {activePrompt && ( -
- -
- - ▸ - -
-
- Exact prompt -
-
- Full household batch contract for all benchmark outputs -
-
-
-
- Provider-specific structured-output transport, no external tool -
-
- -
-
- - -
-
-              {promptFormat === "tool" ? activePrompt.tool : activePrompt.json}
-            
-
-
- )} -
+
+ +
- -- + -- + + + + ); @@ -576,10 +546,10 @@ export default function ScenarioExplorer({ setSelectedCell({ variable: v, model: m }) } aria-pressed={isSelected} - className={`w-full text-right rounded px-1.5 py-0.5 font-[family-name:var(--font-mono)] transition-colors ${ + className={`flex w-full items-center justify-end gap-1.5 rounded-md border px-2 py-1 text-right font-[family-name:var(--font-mono)] shadow-sm transition-colors focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-primary-strong/40 ${ isSelected - ? "bg-primary-soft ring-1 ring-primary-strong/40" - : "hover:bg-surface-soft" + ? "border-primary-strong/50 bg-primary-soft ring-1 ring-primary-strong/40" + : "border-border-subtle bg-card/60 hover:border-primary-strong/40 hover:bg-surface-soft" }`} style={{ color: correct @@ -587,7 +557,20 @@ export default function ScenarioExplorer({ : getPredictionTextColor(predictionError, truth), }} > - {displayPred} + {displayPred} + + + + ); @@ -599,10 +582,102 @@ export default function ScenarioExplorer({
+ {totalPredictionRows > 0 && ( +
+
+ Explanation and audit coverage +
+

+ {explanationRows} of {totalPredictionRows} model-output rows for + this household include explanation text returned by the model.{" "} + {annotationRows} rows include developer audit notes for incorrect + predictions, and {caseAnnotationRows} incorrect rows include + case-level notes comparing wrong models on the same + household-output target. +

+ {Object.keys(failureSources).length > 0 && ( +
+ {Object.entries(failureSources) + .sort((a, b) => b[1] - a[1]) + .map(([source, count]) => ( + + {formatFailureLabel(source)}: {count} + + ))} +
+ )} +
+ )} + + {activePrompt && ( +
+ +
+ + ▸ + +
+
+ Exact prompt +
+
+ Full household batch contract for all benchmark outputs +
+
+
+
+ Provider-specific structured-output transport, no external tool +
+
+ +
+
+ + +
+
+              {promptFormat === "tool" ? activePrompt.tool : activePrompt.json}
+            
+
+
+ )} + void; views: ViewKey[]; + viewHrefs?: Partial>; }) { return (
- {views.map((view) => ( - - ))} + }`; + + if (href) { + return ( + onSelect(view)} + aria-current={selectedView === view ? "page" : undefined} + className={className} + > + {VIEW_LABELS[view]} + + ); + } + + return ( + + ); + })}
); } -function getScrollProgress(threshold: number) { +function getScrollProgress(start: number, distance: number) { if (typeof window === "undefined") return 0; - return Math.min(1, Math.max(0, window.scrollY / threshold)); + return Math.min(1, Math.max(0, (window.scrollY - start) / distance)); } function prefersReducedMotion(): boolean { @@ -61,33 +87,71 @@ function prefersReducedMotion(): boolean { return window.matchMedia("(prefers-reduced-motion: reduce)").matches; } -function useScrollProgress(threshold: number, enabled: boolean) { +function useMobileHeader() { + const [isMobileHeader, setIsMobileHeader] = useState(false); + + useEffect(() => { + if (typeof window === "undefined" || typeof window.matchMedia !== "function") { + return; + } + const media = window.matchMedia("(max-width: 767px)"); + const update = () => setIsMobileHeader(media.matches); + update(); + media.addEventListener("change", update); + return () => media.removeEventListener("change", update); + }, []); + + return isMobileHeader; +} + +function useScrollProgress(start: number, distance: number, enabled: boolean) { const [progress, setProgress] = useState(() => - enabled ? getScrollProgress(threshold) : 0, + enabled ? getScrollProgress(start, distance) : 0, ); const rafRef = useRef(0); useEffect(() => { if (!enabled) return; if (prefersReducedMotion()) { - const snap = () => setProgress(getScrollProgress(threshold) > 0.5 ? 1 : 0); + const snap = () => + setProgress(getScrollProgress(start, distance) > 0.5 ? 1 : 0); snap(); + const timeout = window.setTimeout(snap, 0); window.addEventListener("scroll", snap, { passive: true }); - return () => window.removeEventListener("scroll", snap); + window.addEventListener("resize", snap); + window.addEventListener("hashchange", snap); + window.addEventListener("load", snap); + return () => { + window.clearTimeout(timeout); + window.removeEventListener("scroll", snap); + window.removeEventListener("resize", snap); + window.removeEventListener("hashchange", snap); + window.removeEventListener("load", snap); + }; } const onScroll = () => { cancelAnimationFrame(rafRef.current); rafRef.current = requestAnimationFrame(() => { - setProgress(getScrollProgress(threshold)); + setProgress(getScrollProgress(start, distance)); }); }; onScroll(); + const settleTimeout = window.setTimeout(onScroll, 0); + const hashScrollTimeout = window.setTimeout(onScroll, 120); window.addEventListener("scroll", onScroll, { passive: true }); + window.addEventListener("resize", onScroll); + window.addEventListener("hashchange", onScroll); + window.addEventListener("load", onScroll); return () => { + window.clearTimeout(settleTimeout); + window.clearTimeout(hashScrollTimeout); window.removeEventListener("scroll", onScroll); + window.removeEventListener("resize", onScroll); + window.removeEventListener("hashchange", onScroll); + window.removeEventListener("load", onScroll); cancelAnimationFrame(rafRef.current); }; - }, [threshold, enabled]); + }, [start, distance, enabled]); return enabled ? progress : 0; } @@ -98,6 +162,7 @@ export type SiteHeaderProps = { selectedView?: ViewKey; onSelectView?: (view: ViewKey) => void; availableViews?: ViewKey[]; + viewHrefs?: Partial>; actionLink?: HeaderActionLink; /** * Optional expanded content shown inside the sticky header. Use only with @@ -120,25 +185,39 @@ export default function SiteHeader({ selectedView, onSelectView, availableViews, + viewHrefs, actionLink, expandedContent, alwaysExpanded = false, }: SiteHeaderProps) { - // Background and content opacity are scroll-driven so the sticky bar reveals - // itself as the in-flow hero scrolls away. These properties don't affect - // layout, so they can't trigger the scroll-position feedback loop. - const progress = useScrollProgress(160, !alwaysExpanded); - const bgOpacity = alwaysExpanded ? 1 : progress; - const contentOpacity = alwaysExpanded ? 1 : progress; + // Background and compact content opacity are scroll-driven, but staggered: + // the background covers text moving under the sticky bar, while the compact + // brand/nav wait until the hero intro has cleared. These properties don't + // affect layout, so they can't trigger the scroll-position feedback loop. + const isMobileHeader = useMobileHeader(); + const backgroundProgress = useScrollProgress( + HEADER_BACKGROUND_REVEAL_START, + HEADER_BACKGROUND_REVEAL_DISTANCE, + !alwaysExpanded, + ); + const compactContentProgress = useScrollProgress( + COMPACT_HEADER_REVEAL_START, + COMPACT_HEADER_REVEAL_DISTANCE, + !alwaysExpanded, + ); + const bgOpacity = alwaysExpanded || isMobileHeader ? 1 : backgroundProgress; + const contentOpacity = + alwaysExpanded || isMobileHeader ? 1 : compactContentProgress; const contentVisible = alwaysExpanded || contentOpacity > 0.05; const showViewSelector = availableViews && availableViews.length > 0 && selectedView && onSelectView; + const headerPositionClass = alwaysExpanded ? "relative z-40" : "sticky top-0 z-40"; return ( -
+
)}
0 && (
-
+
{navItems.map((item) => ( )} -
+
{showViewSelector && ( - +
+ +
)} {actionLink && ( -
+
{actionLink.type === "external" ? ( by ): Record { + const entries = Object.entries(weights).filter( + ([variable]) => !isExcludedOutput(variable), + ); + const total = entries.reduce((sum, [, weight]) => sum + weight, 0); + if (total <= 0) return Object.fromEntries(entries); + return Object.fromEntries( + entries.map(([variable, weight]) => [variable, weight / total]), + ); +} + +function filterWeights(weights?: GlobalWeightsByView): GlobalWeightsByView | undefined { + if (!weights) return undefined; + return Object.fromEntries( + WEIGHTING_KEYS.map((key) => [key, normalizeWeights(weights[key] ?? {})]), + ) as GlobalWeightsByView; +} + +function groupWeights(weights: Record): Record { + const grouped: Record = {}; + for (const [variable, weight] of Object.entries(weights)) { + const group = outputGroupForVariable(variable); + grouped[group] = (grouped[group] ?? 0) + weight; + } + return grouped; +} + +function metricHits( + variable: string, + truth: number, + prediction: number | null | undefined, + country: BenchData["country"], +) { + const parsed = + prediction !== null && prediction !== undefined && !Number.isNaN(prediction); + if (!parsed) { + return { parsed: false, exact: 0, within1pct: 0, within5pct: 0, within10pct: 0 }; + } + if (metricTypeForVariable(variable, country) === "binary") { + const exact = Math.round(prediction) === Math.round(truth) ? 1 : 0; + return { parsed, exact, within1pct: exact, within5pct: exact, within10pct: exact }; + } + const absError = Math.abs(prediction - truth); + const exact = absError <= 1 ? 1 : 0; + const within = (tolerance: number) => + truth === 0 ? exact : absError / Math.abs(truth) <= tolerance ? 1 : 0; + return { + parsed, + exact, + within1pct: within(0.01), + within5pct: within(0.05), + within10pct: within(0.1), + }; +} + +function weightedHeatmapScore( + heatmap: HeatmapEntry[], + model: string, + weights: Record, +): number | undefined { + const groupedWeights = groupWeights(weights); + let numerator = 0; + let denominator = 0; + for (const entry of heatmap) { + if (entry.condition !== "no_tools" || entry.model !== model) continue; + const weight = groupedWeights[entry.variable]; + if (weight === undefined) continue; + numerator += weight * (entry.score / 100); + denominator += weight; + } + return denominator > 0 ? (numerator / denominator) * 100 : undefined; +} + +function recomputeModelStats( + payload: BenchData, + scenarioPredictions: Record, + heatmap: HeatmapEntry[], + weights?: GlobalWeightsByView, +): ModelStat[] { + return payload.modelStats.map((stat) => { + if (stat.condition !== "no_tools") return stat; + + let n = 0; + let nParsed = 0; + let exact = 0; + let within1pct = 0; + let within5pct = 0; + let within10pct = 0; + let amountN = 0; + let amountScore = 0; + let participationN = 0; + let participationScore = 0; + let maeN = 0; + let mae = 0; + + for (const variableMap of Object.values(scenarioPredictions)) { + for (const [variable, modelMap] of Object.entries(variableMap)) { + const row = modelMap[stat.model]; + if (!row) continue; + n += 1; + const hits = metricHits( + variable, + row.groundTruth, + row.prediction, + payload.country, + ); + if (hits.parsed) { + nParsed += 1; + mae += Math.abs((row.prediction ?? 0) - row.groundTruth); + maeN += 1; + } + exact += hits.exact; + within1pct += hits.within1pct; + within5pct += hits.within5pct; + within10pct += hits.within10pct; + + if (metricTypeForVariable(variable, payload.country) === "binary") { + participationN += 1; + participationScore += hits.exact; + } else { + amountN += 1; + amountScore += scorePrediction( + variable, + payload.country, + row.groundTruth, + row.prediction, + ); + } + } + } + + const modelHeatmap = heatmap.filter( + (entry) => entry.condition === "no_tools" && entry.model === stat.model, + ); + const outputGroupScore = + modelHeatmap.length > 0 + ? modelHeatmap.reduce((sum, entry) => sum + entry.score, 0) / + modelHeatmap.length + : stat.outputGroupScore; + const boundedScore = + weights?.household && weightedHeatmapScore(heatmap, stat.model, weights.household); + const aggregateScore = + weights?.aggregate && weightedHeatmapScore(heatmap, stat.model, weights.aggregate); + const equalScore = + weights?.equal && weightedHeatmapScore(heatmap, stat.model, weights.equal); + + return { + ...stat, + score: boundedScore ?? stat.score, + outputGroupScore, + exact: n > 0 ? (exact / n) * 100 : stat.exact, + within1pct: n > 0 ? (within1pct / n) * 100 : stat.within1pct, + within5pct: n > 0 ? (within5pct / n) * 100 : stat.within5pct, + within10pct: n > 0 ? (within10pct / n) * 100 : stat.within10pct, + n, + nParsed, + coverage: n > 0 ? (nParsed / n) * 100 : stat.coverage, + mae: maeN > 0 ? mae / maeN : stat.mae, + amountAccuracy: + amountN > 0 ? (amountScore / amountN) * 100 : stat.amountAccuracy, + participationAccuracy: + participationN > 0 + ? (participationScore / participationN) * 100 + : stat.participationAccuracy, + boundedScore: boundedScore ?? stat.boundedScore, + aggregateScore: aggregateScore ?? stat.aggregateScore, + equalScore: equalScore ?? stat.equalScore, + }; + }); +} + +function filterCountryPayload(payload: BenchData): BenchData { + const hasExcludedOutput = + payload.programStats.some((program) => isExcludedOutput(program.variable)) || + payload.heatmap.some((entry) => isExcludedOutput(entry.variable)) || + Object.values(payload.scenarioPredictions).some((variableMap) => + Object.keys(variableMap).some(isExcludedOutput), + ); + if (!hasExcludedOutput) return payload; + + const scenarios = Object.fromEntries( + Object.entries(payload.scenarios).map(([scenarioId, scenario]) => [ + scenarioId, + { + ...scenario, + prompt: scenario.prompt + ? { + tool: scrubPrompt(scenario.prompt.tool), + json: scrubPrompt(scenario.prompt.json), + } + : undefined, + }, + ]), + ); + const scenarioPredictions = Object.fromEntries( + Object.entries(payload.scenarioPredictions).map(([scenarioId, variableMap]) => [ + scenarioId, + Object.fromEntries( + Object.entries(variableMap).filter( + ([variable]) => !isExcludedOutput(variable), + ), + ), + ]), + ); + const heatmap = payload.heatmap.filter( + (entry) => !isExcludedOutput(entry.variable), + ); + const weights = filterWeights(payload.globalWeights); + + return { + ...payload, + scenarios, + scenarioPredictions, + programStats: payload.programStats.filter( + (program) => !isExcludedOutput(program.variable), + ), + heatmap, + globalWeights: weights, + failureModes: { + ...payload.failureModes, + programs: payload.failureModes.programs.filter( + (program) => !isExcludedOutput(program.variable), + ), + }, + modelStats: recomputeModelStats(payload, scenarioPredictions, heatmap, weights), + }; +} + +export function filterExcludedOutputs( + dashboard: DashboardBundle, +): DashboardBundle { + return { + ...dashboard, + countries: Object.fromEntries( + Object.entries(dashboard.countries).map(([country, payload]) => [ + country, + payload ? filterCountryPayload(payload) : payload, + ]), + ), + }; +} diff --git a/app/src/lib/programFilters.ts b/app/src/lib/programFilters.ts new file mode 100644 index 0000000..23cfdeb --- /dev/null +++ b/app/src/lib/programFilters.ts @@ -0,0 +1,132 @@ +import { getVariableLabel, type BenchData } from "../types"; +import { outputGroupForVariable } from "./scoring"; + +export type ProgramOption = { + variable: string; + label: string; +}; + +export type ProgramRate = { + variable: string; + value: number | undefined; +}; + +export function buildProgramOptions(data: BenchData): ProgramOption[] { + const variables = new Set(); + for (const entry of data.heatmap) { + if (entry.condition === "no_tools") { + variables.add(outputGroupForVariable(entry.variable)); + } + } + return Array.from(variables) + .map((variable) => ({ + variable, + label: getVariableLabel(variable, data.country), + })) + .sort((a, b) => a.label.localeCompare(b.label)); +} + +export function resolveActiveProgramIds( + programOptionIds: readonly string[], + selectedPrograms: Set, +): Set { + const all = new Set(programOptionIds); + if (selectedPrograms.size === 0) return all; + const active = new Set( + [...selectedPrograms] + .map(outputGroupForVariable) + .filter((variable) => all.has(variable)), + ); + return active.size > 0 ? active : all; +} + +export function toggleProgramSelection( + programOptionIds: readonly string[], + selectedPrograms: Set, + variable: string, +): Set { + const allowed = new Set(programOptionIds); + const normalized = new Set( + [...selectedPrograms] + .map(outputGroupForVariable) + .filter((value) => allowed.has(value)), + ); + const next = + selectedPrograms.size === 0 || normalized.size === 0 + ? new Set(programOptionIds) + : normalized; + const groupedVariable = outputGroupForVariable(variable); + + if (next.has(groupedVariable)) { + if (next.size === 1) return next; + next.delete(groupedVariable); + } else { + next.add(groupedVariable); + } + + return next.size === programOptionIds.length ? new Set() : next; +} + +export function selectOnlyProgram(variable: string): Set { + return new Set([outputGroupForVariable(variable)]); +} + +export function programIsActive( + activeProgramIds: Set, + variable: string, +): boolean { + return activeProgramIds.has(outputGroupForVariable(variable)); +} + +export function groupWeights( + weights: Record, +): Record { + const grouped: Record = {}; + for (const [variable, weight] of Object.entries(weights)) { + const group = outputGroupForVariable(variable); + grouped[group] = (grouped[group] ?? 0) + weight; + } + return grouped; +} + +export function weightForProgram( + weights: Record, + variable: string, +): number | undefined { + return groupWeights(weights)[outputGroupForVariable(variable)]; +} + +function groupRates(rates: Iterable): ProgramRate[] { + const grouped = new Map(); + for (const { variable, value } of rates) { + if (value === undefined) continue; + const group = outputGroupForVariable(variable); + const acc = grouped.get(group) ?? { sum: 0, n: 0 }; + acc.sum += value; + acc.n += 1; + grouped.set(group, acc); + } + return [...grouped.entries()].map(([variable, { sum, n }]) => ({ + variable, + value: sum / n, + })); +} + +export function weightedProgramScore( + rates: Iterable, + weights: Record, +): number | undefined { + const groupedWeights = groupWeights(weights); + let numerator = 0; + let denominator = 0; + + for (const { variable, value } of groupRates(rates)) { + if (value === undefined) continue; + const weight = groupedWeights[variable]; + if (weight === undefined) continue; + numerator += weight * (value / 100); + denominator += weight; + } + + return denominator > 0 ? (numerator / denominator) * 100 : undefined; +} diff --git a/app/src/types.ts b/app/src/types.ts index c2f7153..4024bf5 100644 --- a/app/src/types.ts +++ b/app/src/types.ts @@ -27,7 +27,6 @@ const US_VARIABLE_LABELS: Record = { snap: "SNAP", ssi: "SSI", tanf: "TANF", - premium_tax_credit: "ACA Premium Tax Credit", free_school_meals_eligible: "Free school meals eligibility", reduced_price_school_meals_eligible: "Reduced-price school meals eligibility", @@ -64,7 +63,6 @@ const US_VARIABLE_CATEGORIES: Record = { snap: "Benefits", ssi: "Benefits", tanf: "Benefits", - premium_tax_credit: "Health", free_school_meals_eligible: "Coverage", reduced_price_school_meals_eligible: "Coverage", person_wic_eligible: "Coverage", diff --git a/app/src/variableExplainers.ts b/app/src/variableExplainers.ts index 4d839c8..765d8e7 100644 --- a/app/src/variableExplainers.ts +++ b/app/src/variableExplainers.ts @@ -19,15 +19,7 @@ const US_EXPLAINERS: Record = { "This target captures the refundable federal credit side of the income-tax calculation.", bullets: [ "It includes EITC and refundable portions of credits such as refundable CTC when applicable.", - "It excludes the ACA Premium Tax Credit, which is outside the benchmark federal income-tax target.", - ], - }, - premium_tax_credit: { - summary: - "This target captures ACA Marketplace premium assistance as a health-related resource, separate from federal income-tax credits.", - bullets: [ - "It depends on Marketplace eligibility, disqualifying health coverage such as affordable employer coverage, ACA MAGI, and the local second-lowest-cost silver plan premium.", - "Marketplace plan facts are phrased as selected-plan information a household might know, while the local benchmark premium usually still has to be estimated.", + "It keeps refundable income-tax credits separate from the nonrefundable-credit target.", ], }, snap: { diff --git a/app/tests/programFilters.test.ts b/app/tests/programFilters.test.ts new file mode 100644 index 0000000..791ab9a --- /dev/null +++ b/app/tests/programFilters.test.ts @@ -0,0 +1,81 @@ +import assert from "node:assert/strict"; +import test from "node:test"; +import { + resolveActiveProgramIds, + selectOnlyProgram, + toggleProgramSelection, + weightForProgram, + weightedProgramScore, +} from "../src/lib/programFilters"; + +function assertClose(actual: number | undefined, expected: number): void { + assert.notEqual(actual, undefined); + assert.ok(Math.abs(actual - expected) < 1e-9); +} + +test("weightedProgramScore rescales weights over selected programs", () => { + const weights = { + federal_income_tax_before_refundable_credits: 0.9, + snap: 0.1, + }; + + assertClose( + weightedProgramScore( + [ + { variable: "federal_income_tax_before_refundable_credits", value: 80 }, + { variable: "snap", value: 20 }, + ], + weights, + ), + 74, + ); + assertClose( + weightedProgramScore( + [{ variable: "federal_income_tax_before_refundable_credits", value: 80 }], + weights, + ), + 80, + ); +}); + +test("weightedProgramScore groups person-level eligibility outputs", () => { + const weights = { + person_wic_eligible: 0.4, + snap: 0.6, + }; + + assert.equal(weightForProgram(weights, "head_wic_eligible"), 0.4); + assertClose( + weightedProgramScore( + [ + { variable: "head_wic_eligible", value: 0 }, + { variable: "spouse_wic_eligible", value: 100 }, + { variable: "snap", value: 50 }, + ], + weights, + ), + 50, + ); +}); + +test("program selection normalizes groups and keeps at least one active", () => { + const options = ["person_wic_eligible", "snap"]; + + assert.deepEqual( + [...resolveActiveProgramIds(options, new Set(["head_wic_eligible"]))], + ["person_wic_eligible"], + ); + assert.deepEqual([...selectOnlyProgram("spouse_wic_eligible")], [ + "person_wic_eligible", + ]); + assert.deepEqual( + [ + ...toggleProgramSelection( + options, + new Set(["person_wic_eligible"]), + "head_wic_eligible", + ), + ], + ["person_wic_eligible"], + ); +}); diff --git a/paper/index.qmd b/paper/index.qmd index 6abf3ea..01dfca4 100644 --- a/paper/index.qmd +++ b/paper/index.qmd @@ -934,7 +934,7 @@ snapshot_provenance = pd.DataFrame( ["UK scenario-source SHA-256", UK_TRANSFER_ARTIFACT_SHA256], ["Households", "100 US and 100 UK"], ["Models", f"{len(global_model)} shared models"], - ["Output groups", "19 US and 7 UK"], + ["Output groups", "18 US and 7 UK"], ["Condition", "No tools, no web access, one structured response per household"], ["Response contract", "Numeric answer and non-empty explanation for every requested output"], ], @@ -944,7 +944,6 @@ scope_rationale = pd.DataFrame( [ ["Included", "Direct tax, credit, benefit, health-support, and coverage outputs that a household-facing model could plausibly be asked to estimate from household facts."], ["Excluded", "Intermediate tax bases, payroll subcomponents, and outputs that mainly require unavailable history, restricted local market data, restricted program-administration data, or take-up assignment rather than rule calculation."], - ["ACA Premium Tax Credit", "Retained as a deliberate health-support output; when local benchmark premiums are not listed, the model must estimate them from the household facts."], ["Binary coverage outputs", "Requested as 0/1 eligibility flags and scored as classification tasks; their dollar values are used only as impact-weight proxies, not as requested model outputs."], ["WIC", "The benchmark asks for person-level WIC eligibility. It does not ask models to estimate a WIC dollar amount."], ], @@ -967,7 +966,7 @@ parse_repair_summary = pd.DataFrame( ["Parser repair", "The parser was extended to recover explicit `value` and non-empty `explanation` blocks from nested, escaped, or partially truncated provider JSON without scraping prose numbers."], ["Full-response retries", "Three bounded retry rounds targeted broken country-model-household responses and accepted only fully valid replacement responses."], ["Row-level repairs", "A final repair pass retried only rows still missing a parsed numeric value or non-empty explanation, using the same model, household, and output."], - ["Final parse coverage", "The repaired manuscript snapshot has zero missing parsed numeric values and zero missing explanations across all 34,656 model-output rows."], + ["Final parse coverage", "The repaired manuscript snapshot has zero missing parsed numeric values and zero missing explanations across all 33,456 model-output rows."], ["Preservation rule", "The snapshot retains response-retry and row-repair targets, attempts, accepted replacements, rejected rows, and merged prediction files."], ], columns=["Step", "Finding"], @@ -1086,7 +1085,7 @@ model_runs The US benchmark is built from Enhanced Current Population Survey (CPS)-derived households using PolicyEngine US. The sampled households are filtered to keep a single-tax-unit, single-family, single-Supplemental Poverty Measure (SPM)-unit structure with at least one adult and a supported filing status. The 2024 Enhanced CPS source contains 41,314 households; 30,173 (73.0%) pass the filter and form the eligible draw. The 27.0% excluded by the filter include multi-tax-unit households (e.g., adult roommates), multi-family households, multi-SPM-unit households, and households whose head reports a filing status outside the supported set. These excluded compositions are exactly the kind of cases where federal/state credit allocations and benefit-unit rules become hardest, so the eligible draw is a tractable subset rather than the full distribution of US households. Prompts include nonzero promptable raw inputs across relevant entities rather than a hand-curated summary, so the models see many of the same facts the simulator receives. Filing status is not stated in the prompt; the reference computation infers it from tax-unit role flags. Models therefore see the same household facts that drive the reference filing-status assignment, but they do not receive that assignment as a label. -The current US release evaluates 19 output groups spanning federal income tax, refundable credits, payroll and self-employment tax, state and local income tax, Supplemental Nutrition Assistance Program (SNAP), Supplemental Security Income (SSI), Temporary Assistance for Needy Families (TANF), Affordable Care Act (ACA) premium tax credits, school-meal eligibility, and person-level coverage eligibility for the Special Supplemental Nutrition Program for Women, Infants, and Children (WIC), Medicaid, the Children's Health Insurance Program (CHIP), Medicare, Head Start, and Early Head Start. +The current US release evaluates 18 output groups spanning federal income tax, refundable credits, payroll and self-employment tax, state and local income tax, Supplemental Nutrition Assistance Program (SNAP), Supplemental Security Income (SSI), Temporary Assistance for Needy Families (TANF), school-meal eligibility, and person-level coverage eligibility for the Special Supplemental Nutrition Program for Women, Infants, and Children (WIC), Medicaid, the Children's Health Insurance Program (CHIP), Medicare, Head Start, and Early Head Start. The output scope is intentionally narrower than the full PolicyEngine model. @tbl-scope-rationale summarizes the inclusion rule. The benchmark asks for WIC eligibility rather than a WIC dollar amount; WIC dollar values are used only as impact-weight proxies for coverage flags, not as requested model outputs. diff --git a/paper/snapshot/20260501/manifest.json b/paper/snapshot/20260501/manifest.json index 1b4d441..3e9794d 100644 --- a/paper/snapshot/20260501/manifest.json +++ b/paper/snapshot/20260501/manifest.json @@ -66,7 +66,7 @@ "files": { "figures/global_leaderboard.png": "526511477880573c4e7cae98e3d966eb400eecf7e44c807ddcd4a5d377ebcd6f", "figures/positive_zero_scatter.png": "b15332fdda92c8f23269937c90968fb50327186fb154bc29729264586dc463d5", - "index.html": "cd72096f22179ed8cc919b5d61ea8441488bd11cc9e8bccc06d44a874aac6196", + "index.html": "1a45bc2d3d15c3592106fbaa2b5eeea4c2e23ceacf348c464e49387f8bb2b8b9", "pe-tokens.css": "8f24d8da26f583c8ffddffcdcd172b6d52cbecfec20eda55bd39d7aa829f41d8", "policybench-theme.css": "0e12c5fd615558259e5bce0167a38424e54f9ceb280666c4afd660d759cd1cb9", "site_libs/clipboard/clipboard.min.js": "e17a1d816e13c0826e0ed7febfabc3277f45571234bde0bf9120829a7169edc9",