PolicyEngine · MaxGhenis · May 25, 2026
diff --git a/policybench/analysis.py b/policybench/analysis.py
@@ -801,9 +801,7 @@ def _match(row):
             pred_flag = binary_flag(pred)
             ref_flag = binary_flag(ref)
             return (
-                pred_flag is not None
-                and ref_flag is not None
-                and pred_flag == ref_flag
+                pred_flag is not None and ref_flag is not None and pred_flag == ref_flag
             )
         return (float(ref) == 0.0) == (float(pred) == 0.0)
 
@@ -1245,14 +1243,17 @@ def compute_metrics(
             row["threshold_score"] = float(
                 np.mean([exact, within_1pct, within_5pct, within_10pct])
             )
-            row["score"] = float(
-                np.mean(
-                    [
-                        bounded_row_score(variable, truth, pred)
-                        for truth, pred in zip(y_true, y_pred, strict=True)
-                    ]
+            row["score"] = (
+                float(
+                    np.mean(
+                        [
+                            bounded_row_score(variable, truth, pred)
+                            for truth, pred in zip(y_true, y_pred, strict=True)
+                        ]
+                    )
                 )
-            ) * coverage
+                * coverage
+            )
 
         rows.append(row)
 

diff --git a/policybench/benchmark_specs.json b/policybench/benchmark_specs.json
@@ -119,7 +119,7 @@
                         "id": "premium_tax_credit",
                         "pe_variable": "premium_tax_credit",
                         "label": "ACA Premium Tax Credit",
-                        "prompt": "annual ACA Premium Tax Credit amount for Marketplace health insurance premium assistance. Use any listed Marketplace plan information as what the household knows about the plan they selected. Estimate any needed local Marketplace benchmark premium from the household facts if it is not provided. If no selected Marketplace plan information is listed, assume the selected plan costs about the same as the local benchmark Silver plan. Return 0 if the household is ineligible or does not receive Marketplace premium assistance",
+                        "prompt": "annual ACA Premium Tax Credit amount for Marketplace health insurance premium assistance. For this output, assume Marketplace enrollment and credit take-up if the household is ACA-eligible; do not return 0 merely because Marketplace enrollment, the local benchmark premium, or a selected-plan premium is not listed. Use any listed Marketplace plan information as what the household knows about the plan they selected. Estimate any needed local Marketplace benchmark premium from the household facts if it is not provided. If no selected Marketplace plan information is listed, assume the selected plan costs about the same as the local benchmark Silver plan. Return 0 only if the household is ineligible for Marketplace premium assistance under ACA rules or listed facts show disqualifying coverage",
                         "metric_type": "amount",
                         "role": "health",
                         "output_set": "excluded_prompt_issue",

diff --git a/policybench/eval_no_tools.py b/policybench/eval_no_tools.py
@@ -42,6 +42,9 @@ def _env_int(name: str, default: int) -> int:
 GEMINI_PRO_REQUEST_TIMEOUT_SECONDS = _env_int(
     "POLICYBENCH_GEMINI_PRO_REQUEST_TIMEOUT_SECONDS", 60
 )
+CLAUDE_REQUEST_TIMEOUT_SECONDS = _env_int(
+    "POLICYBENCH_CLAUDE_REQUEST_TIMEOUT_SECONDS", 90
+)
 XAI_REQUEST_TIMEOUT_SECONDS = _env_int("POLICYBENCH_XAI_REQUEST_TIMEOUT_SECONDS", 420)
 REQUEST_WALL_TIMEOUT_GRACE_SECONDS = 30
 REQUEST_WALL_TIMEOUT_MULTIPLIER = 1.5
@@ -365,6 +368,8 @@ def _request_timeout_seconds(model_id: str) -> int:
         return GEMINI_PRO_REQUEST_TIMEOUT_SECONDS
     if model_id == "gpt-5.5":
         return GEMINI_PRO_REQUEST_TIMEOUT_SECONDS
+    if model_id.startswith("claude-"):
+        return CLAUDE_REQUEST_TIMEOUT_SECONDS
     if model_id.startswith("xai/"):
         return XAI_REQUEST_TIMEOUT_SECONDS
     return REQUEST_TIMEOUT_SECONDS

diff --git a/policybench/scenarios.py b/policybench/scenarios.py
@@ -90,6 +90,7 @@
     "employer_quarterly_payroll_expense_override",
     "employer_state_unemployment_tax_rate_override",
     "has_itin",
+    "has_champva_health_coverage_at_interview",
     "has_marketplace_health_coverage",
     "has_marketplace_health_coverage_at_interview",
     "has_medicaid_health_coverage_at_interview",
@@ -220,6 +221,31 @@
     "long_term_capital_gains",
 }
 
+# These fields represent costs, premiums, balances, asset values, taxes, or
+# expenses that cannot be negative in ordinary household facts. Some
+# survey/imputation artifacts encode negative values; omit those rather than
+# asking models to reason from impossible prompt facts. Loss-like income fields,
+# such as rental or partnership income, remain promptable when negative.
+NONNEGATIVE_PROMPT_INPUTS = {
+    "auto_loan_balance",
+    "auto_loan_interest",
+    "charitable_cash_donations",
+    "charitable_non_cash_donations",
+    "deductible_mortgage_interest",
+    "employer_sponsored_insurance_premiums",
+    "first_home_mortgage_balance",
+    "first_home_mortgage_interest",
+    "health_insurance_premiums_without_medicare_part_b",
+    "home_mortgage_interest",
+    "household_vehicles_value",
+    "medicare_part_b_premiums",
+    "other_health_insurance_premiums",
+    "other_medical_expenses",
+    "over_the_counter_health_expenses",
+    "pre_subsidy_rent",
+    "real_estate_taxes",
+}
+
 BASE_CPS_COLUMNS = {
     "person_id": "person_id",
     "household_id": "household_id",
@@ -1056,6 +1082,8 @@ def _extract_entity_inputs(
             continue
 
         value = float(row[spec.output_name])
+        if spec.output_name in NONNEGATIVE_PROMPT_INPUTS and value < 0:
+            continue
         if (
             spec.default_value is not None
             and abs(value - float(spec.default_value)) <= 1e-6

diff --git a/tests/test_analysis.py b/tests/test_analysis.py
@@ -764,9 +764,7 @@ def test_build_dashboard_payload_matches_frontend_shape(self):
         assert "within10pctRunMean" not in payload["modelStats"][0]
         assert payload["heatmap"][0]["condition"] == "no_tools"
         income_program = next(
-            row
-            for row in payload["programStats"]
-            if row["variable"] == "income_tax"
+            row for row in payload["programStats"] if row["variable"] == "income_tax"
         )
         assert income_program["score"] == pytest.approx(92.5)
         assert income_program["thresholdScore"] == pytest.approx(37.5)

diff --git a/tests/test_eval_no_tools.py b/tests/test_eval_no_tools.py
@@ -646,10 +646,12 @@ def test_premium_tax_credit_prompt_places_ptc_in_health(mini_scenario):
     prompt_lower = prompt.lower()
     assert "aca premium tax credit" in prompt_lower
     assert "marketplace health insurance premium assistance" in prompt_lower
+    assert "assume marketplace enrollment and credit take-up" in prompt_lower
+    assert "do not return 0 merely because marketplace enrollment" in prompt_lower
     assert "what the household knows about the plan they selected" in prompt_lower
     assert "benchmark premium" in prompt_lower
     assert "selected plan costs about the same" in prompt_lower
-    assert "return 0 if" in prompt_lower
+    assert "return 0 only if" in prompt_lower
 
 
 def test_compact_tax_breakdown_prompts_are_explicit(mini_scenario):
@@ -1495,6 +1497,12 @@ def test_gpt_55_uses_longer_full_output_timeout():
     assert _request_timeout_seconds("gpt-5.5") == 60
 
 
+def test_claude_models_use_longer_timeout():
+    """Claude explanation calls can exceed the generic 20s request timeout."""
+    assert _request_timeout_seconds("claude-sonnet-4.6") == 90
+    assert _request_timeout_seconds("claude-opus-4.7") == 90
+
+
 def test_xai_models_use_longer_timeout():
     """xAI models need more than the generic 20s request timeout."""
     assert _request_timeout_seconds("xai/grok-4.3") == 420

diff --git a/tests/test_scenarios.py b/tests/test_scenarios.py
@@ -661,6 +661,41 @@ def test_aggregate_net_worth_input_is_not_preserved():
     assert "net_worth" not in scenario.household_inputs
 
 
+def test_impossible_negative_cost_inputs_are_not_preserved():
+    """Negative cost artifacts and the CHAMPVA flag are dropped; losses are kept."""
+    scenario = scenarios_from_cps_frame(
+        pd.DataFrame(
+            [
+                {
+                    "person_id": 1,
+                    "household_id": 1,
+                    "tax_unit_id": 1,
+                    "spm_unit_id": 1,
+                    "family_id": 1,
+                    "household_weight": 1.0,
+                    "state_code": "PA",
+                    "filing_status": "SINGLE",
+                    "age": 35,
+                    "employment_income": 50_000.0,
+                    "has_champva_health_coverage_at_interview": True,
+                    "other_health_insurance_premiums": -5_000.0,
+                    "other_medical_expenses": -100.0,
+                    "partnership_s_corp_income": -2_000.0,
+                    "is_tax_unit_head": True,
+                }
+            ]
+        ),
+        n=1,
+        seed=0,
+    )[0]
+
+    head = scenario.adults[0].inputs
+    assert "has_champva_health_coverage_at_interview" not in head
+    assert "other_health_insurance_premiums" not in head
+    assert "other_medical_expenses" not in head
+    assert head["partnership_s_corp_income"] == -2_000.0
+
+
 def test_formula_overtime_premium_is_not_prompted_or_sent_to_policyengine():
     scenario = scenarios_from_cps_frame(
         pd.DataFrame(

diff --git a/tests/test_snapshot_artifacts.py b/tests/test_snapshot_artifacts.py
@@ -149,16 +149,15 @@ def _aggregate_scenario_metric(country_payload: dict, metric: str) -> dict[str,
             household_score = 0.0
             for variable, model_map in variables:
                 row = model_map[model]
-                household_score += (
-                    raw_row_weights[variable] / denominator
-                ) * row[metric]
+                household_score += (raw_row_weights[variable] / denominator) * row[
+                    metric
+                ]
             entry = totals.setdefault(model, {"score": 0.0, "households": 0.0})
             entry["score"] += household_score
             entry["households"] += 1
 
     return {
-        model: entry["score"] / entry["households"]
-        for model, entry in totals.items()
+        model: entry["score"] / entry["households"] for model, entry in totals.items()
     }