Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 11 additions & 10 deletions policybench/analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -801,9 +801,7 @@ def _match(row):
pred_flag = binary_flag(pred)
ref_flag = binary_flag(ref)
return (
pred_flag is not None
and ref_flag is not None
and pred_flag == ref_flag
pred_flag is not None and ref_flag is not None and pred_flag == ref_flag
)
return (float(ref) == 0.0) == (float(pred) == 0.0)

Expand Down Expand Up @@ -1245,14 +1243,17 @@ def compute_metrics(
row["threshold_score"] = float(
np.mean([exact, within_1pct, within_5pct, within_10pct])
)
row["score"] = float(
np.mean(
[
bounded_row_score(variable, truth, pred)
for truth, pred in zip(y_true, y_pred, strict=True)
]
row["score"] = (
float(
np.mean(
[
bounded_row_score(variable, truth, pred)
for truth, pred in zip(y_true, y_pred, strict=True)
]
)
)
) * coverage
* coverage
)

rows.append(row)

Expand Down
2 changes: 1 addition & 1 deletion policybench/benchmark_specs.json
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@
"id": "premium_tax_credit",
"pe_variable": "premium_tax_credit",
"label": "ACA Premium Tax Credit",
"prompt": "annual ACA Premium Tax Credit amount for Marketplace health insurance premium assistance. Use any listed Marketplace plan information as what the household knows about the plan they selected. Estimate any needed local Marketplace benchmark premium from the household facts if it is not provided. If no selected Marketplace plan information is listed, assume the selected plan costs about the same as the local benchmark Silver plan. Return 0 if the household is ineligible or does not receive Marketplace premium assistance",
"prompt": "annual ACA Premium Tax Credit amount for Marketplace health insurance premium assistance. For this output, assume Marketplace enrollment and credit take-up if the household is ACA-eligible; do not return 0 merely because Marketplace enrollment, the local benchmark premium, or a selected-plan premium is not listed. Use any listed Marketplace plan information as what the household knows about the plan they selected. Estimate any needed local Marketplace benchmark premium from the household facts if it is not provided. If no selected Marketplace plan information is listed, assume the selected plan costs about the same as the local benchmark Silver plan. Return 0 only if the household is ineligible for Marketplace premium assistance under ACA rules or listed facts show disqualifying coverage",
"metric_type": "amount",
"role": "health",
"output_set": "excluded_prompt_issue",
Expand Down
5 changes: 5 additions & 0 deletions policybench/eval_no_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,9 @@ def _env_int(name: str, default: int) -> int:
# Request timeouts, in seconds, chosen per model family. Each value is produced
# by _env_int(name, default); presumably the named environment variable
# overrides the integer default -- confirm against _env_int's definition.

# Returned by _request_timeout_seconds for "gpt-5.5" (and, judging by the name,
# for Gemini Pro models -- verify against the full function).
GEMINI_PRO_REQUEST_TIMEOUT_SECONDS = _env_int(
"POLICYBENCH_GEMINI_PRO_REQUEST_TIMEOUT_SECONDS", 60
)
# Returned by _request_timeout_seconds for model ids starting with "claude-".
CLAUDE_REQUEST_TIMEOUT_SECONDS = _env_int(
"POLICYBENCH_CLAUDE_REQUEST_TIMEOUT_SECONDS", 90
)
# Returned by _request_timeout_seconds for model ids starting with "xai/".
XAI_REQUEST_TIMEOUT_SECONDS = _env_int("POLICYBENCH_XAI_REQUEST_TIMEOUT_SECONDS", 420)
# NOTE(review): the two constants below are not used in the visible code; they
# look like inputs for deriving a wall-clock limit from the request timeout
# (grace in seconds, plus a multiplier) -- confirm at the call site.
REQUEST_WALL_TIMEOUT_GRACE_SECONDS = 30
REQUEST_WALL_TIMEOUT_MULTIPLIER = 1.5
Expand Down Expand Up @@ -365,6 +368,8 @@ def _request_timeout_seconds(model_id: str) -> int:
return GEMINI_PRO_REQUEST_TIMEOUT_SECONDS
if model_id == "gpt-5.5":
return GEMINI_PRO_REQUEST_TIMEOUT_SECONDS
if model_id.startswith("claude-"):
return CLAUDE_REQUEST_TIMEOUT_SECONDS
if model_id.startswith("xai/"):
return XAI_REQUEST_TIMEOUT_SECONDS
return REQUEST_TIMEOUT_SECONDS
Expand Down
28 changes: 28 additions & 0 deletions policybench/scenarios.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@
"employer_quarterly_payroll_expense_override",
"employer_state_unemployment_tax_rate_override",
"has_itin",
"has_champva_health_coverage_at_interview",
"has_marketplace_health_coverage",
"has_marketplace_health_coverage_at_interview",
"has_medicaid_health_coverage_at_interview",
Expand Down Expand Up @@ -220,6 +221,31 @@
"long_term_capital_gains",
}

# Inputs that describe costs, premiums, balances, or expenses. In ordinary
# household facts these can never be below zero, but survey/imputation
# artifacts sometimes encode them as negative. Such negative values are omitted
# instead of asking models to reason from impossible prompt facts. Loss-like
# income fields (for example rental or partnership income) are intentionally
# not listed here, so they remain promptable when negative.
NONNEGATIVE_PROMPT_INPUTS = {
    # Loan and mortgage balances / interest
    "auto_loan_balance",
    "auto_loan_interest",
    "deductible_mortgage_interest",
    "first_home_mortgage_balance",
    "first_home_mortgage_interest",
    "home_mortgage_interest",
    # Charitable giving
    "charitable_cash_donations",
    "charitable_non_cash_donations",
    # Health premiums and medical spending
    "employer_sponsored_insurance_premiums",
    "health_insurance_premiums_without_medicare_part_b",
    "medicare_part_b_premiums",
    "other_health_insurance_premiums",
    "other_medical_expenses",
    "over_the_counter_health_expenses",
    # Housing, property, and vehicles
    "pre_subsidy_rent",
    "real_estate_taxes",
    "household_vehicles_value",
}

BASE_CPS_COLUMNS = {
"person_id": "person_id",
"household_id": "household_id",
Expand Down Expand Up @@ -1056,6 +1082,8 @@ def _extract_entity_inputs(
continue

value = float(row[spec.output_name])
if spec.output_name in NONNEGATIVE_PROMPT_INPUTS and value < 0:
continue
if (
spec.default_value is not None
and abs(value - float(spec.default_value)) <= 1e-6
Expand Down
4 changes: 1 addition & 3 deletions tests/test_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -764,9 +764,7 @@ def test_build_dashboard_payload_matches_frontend_shape(self):
assert "within10pctRunMean" not in payload["modelStats"][0]
assert payload["heatmap"][0]["condition"] == "no_tools"
income_program = next(
row
for row in payload["programStats"]
if row["variable"] == "income_tax"
row for row in payload["programStats"] if row["variable"] == "income_tax"
)
assert income_program["score"] == pytest.approx(92.5)
assert income_program["thresholdScore"] == pytest.approx(37.5)
Expand Down
10 changes: 9 additions & 1 deletion tests/test_eval_no_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -646,10 +646,12 @@ def test_premium_tax_credit_prompt_places_ptc_in_health(mini_scenario):
prompt_lower = prompt.lower()
assert "aca premium tax credit" in prompt_lower
assert "marketplace health insurance premium assistance" in prompt_lower
assert "assume marketplace enrollment and credit take-up" in prompt_lower
assert "do not return 0 merely because marketplace enrollment" in prompt_lower
assert "what the household knows about the plan they selected" in prompt_lower
assert "benchmark premium" in prompt_lower
assert "selected plan costs about the same" in prompt_lower
assert "return 0 if" in prompt_lower
assert "return 0 only if" in prompt_lower


def test_compact_tax_breakdown_prompts_are_explicit(mini_scenario):
Expand Down Expand Up @@ -1495,6 +1497,12 @@ def test_gpt_55_uses_longer_full_output_timeout():
assert _request_timeout_seconds("gpt-5.5") == 60


def test_claude_models_use_longer_timeout():
    """Claude explanation calls can outlast the generic 20s timeout, so they get 90s."""
    claude_model_ids = ("claude-sonnet-4.6", "claude-opus-4.7")
    for model_id in claude_model_ids:
        assert _request_timeout_seconds(model_id) == 90


def test_xai_models_use_longer_timeout():
"""xAI models need more than the generic 20s request timeout."""
assert _request_timeout_seconds("xai/grok-4.3") == 420
Expand Down
35 changes: 35 additions & 0 deletions tests/test_scenarios.py
Original file line number Diff line number Diff line change
Expand Up @@ -661,6 +661,41 @@ def test_aggregate_net_worth_input_is_not_preserved():
assert "net_worth" not in scenario.household_inputs


def test_impossible_negative_cost_inputs_are_not_preserved():
    """Negative cost/premium artifacts must be dropped from household facts.

    A single-adult record carries two negative cost inputs, a CHAMPVA coverage
    flag, and a negative partnership income. Only the loss-like income value is
    expected to survive into the head adult's inputs.
    """
    person_record = {
        "person_id": 1,
        "household_id": 1,
        "tax_unit_id": 1,
        "spm_unit_id": 1,
        "family_id": 1,
        "household_weight": 1.0,
        "state_code": "PA",
        "filing_status": "SINGLE",
        "age": 35,
        "employment_income": 50_000.0,
        "has_champva_health_coverage_at_interview": True,
        "other_health_insurance_premiums": -5_000.0,
        "other_medical_expenses": -100.0,
        "partnership_s_corp_income": -2_000.0,
        "is_tax_unit_head": True,
    }
    frame = pd.DataFrame([person_record])
    built = scenarios_from_cps_frame(frame, n=1, seed=0)
    scenario = built[0]

    head = scenario.adults[0].inputs
    omitted_keys = (
        "has_champva_health_coverage_at_interview",
        "other_health_insurance_premiums",
        "other_medical_expenses",
    )
    for key in omitted_keys:
        assert key not in head
    # Loss-like income stays promptable even when negative.
    assert head["partnership_s_corp_income"] == -2_000.0


def test_formula_overtime_premium_is_not_prompted_or_sent_to_policyengine():
scenario = scenarios_from_cps_frame(
pd.DataFrame(
Expand Down
9 changes: 4 additions & 5 deletions tests/test_snapshot_artifacts.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,16 +149,15 @@ def _aggregate_scenario_metric(country_payload: dict, metric: str) -> dict[str,
household_score = 0.0
for variable, model_map in variables:
row = model_map[model]
household_score += (
raw_row_weights[variable] / denominator
) * row[metric]
household_score += (raw_row_weights[variable] / denominator) * row[
metric
]
entry = totals.setdefault(model, {"score": 0.0, "households": 0.0})
entry["score"] += household_score
entry["households"] += 1

return {
model: entry["score"] / entry["households"]
for model, entry in totals.items()
model: entry["score"] / entry["households"] for model, entry in totals.items()
}


Expand Down