From b7db2ceca0f124f9b784f2ff47fac2c0e54e3800 Mon Sep 17 00:00:00 2001
From: Max Ghenis <mghenis@gmail.com>
Date: Mon, 25 May 2026 10:42:30 -0400
Subject: [PATCH] Fix lingering PolicyBench prompt artifacts

---
 policybench/analysis.py          | 21 ++++++++++---------
 policybench/benchmark_specs.json |  2 +-
 policybench/eval_no_tools.py     |  5 +++++
 policybench/scenarios.py         | 28 +++++++++++++++++++++++++
 tests/test_analysis.py           |  4 +---
 tests/test_eval_no_tools.py      | 10 ++++++++-
 tests/test_scenarios.py          | 35 ++++++++++++++++++++++++++++++++
 tests/test_snapshot_artifacts.py |  9 ++++----
 8 files changed, 94 insertions(+), 20 deletions(-)

diff --git a/policybench/analysis.py b/policybench/analysis.py
index fbf49ad..f6a439b 100644
--- a/policybench/analysis.py
+++ b/policybench/analysis.py
@@ -801,9 +801,7 @@ def _match(row):
             pred_flag = binary_flag(pred)
             ref_flag = binary_flag(ref)
             return (
-                pred_flag is not None
-                and ref_flag is not None
-                and pred_flag == ref_flag
+                pred_flag is not None and ref_flag is not None and pred_flag == ref_flag
             )
         return (float(ref) == 0.0) == (float(pred) == 0.0)
 
@@ -1245,14 +1243,17 @@ def compute_metrics(
             row["threshold_score"] = float(
                 np.mean([exact, within_1pct, within_5pct, within_10pct])
             )
-            row["score"] = float(
-                np.mean(
-                    [
-                        bounded_row_score(variable, truth, pred)
-                        for truth, pred in zip(y_true, y_pred, strict=True)
-                    ]
+            row["score"] = (
+                float(
+                    np.mean(
+                        [
+                            bounded_row_score(variable, truth, pred)
+                            for truth, pred in zip(y_true, y_pred, strict=True)
+                        ]
+                    )
                 )
-            ) * coverage
+                * coverage
+            )
 
         rows.append(row)
 
diff --git a/policybench/benchmark_specs.json b/policybench/benchmark_specs.json
index dce2f24..8f19dca 100644
--- a/policybench/benchmark_specs.json
+++ b/policybench/benchmark_specs.json
@@ -119,7 +119,7 @@
                         "id": "premium_tax_credit",
                         "pe_variable": "premium_tax_credit",
                         "label": "ACA Premium Tax Credit",
-                        "prompt": "annual ACA Premium Tax Credit amount for Marketplace health insurance premium assistance. Use any listed Marketplace plan information as what the household knows about the plan they selected. Estimate any needed local Marketplace benchmark premium from the household facts if it is not provided. If no selected Marketplace plan information is listed, assume the selected plan costs about the same as the local benchmark Silver plan. Return 0 if the household is ineligible or does not receive Marketplace premium assistance",
+                        "prompt": "annual ACA Premium Tax Credit amount for Marketplace health insurance premium assistance. For this output, assume Marketplace enrollment and credit take-up if the household is ACA-eligible; do not return 0 merely because Marketplace enrollment, the local benchmark premium, or a selected-plan premium is not listed. Use any listed Marketplace plan information as what the household knows about the plan they selected. Estimate any needed local Marketplace benchmark premium from the household facts if it is not provided. If no selected Marketplace plan information is listed, assume the selected plan costs about the same as the local benchmark Silver plan. Return 0 only if the household is ineligible for Marketplace premium assistance under ACA rules or listed facts show disqualifying coverage",
                         "metric_type": "amount",
                         "role": "health",
                         "output_set": "excluded_prompt_issue",
diff --git a/policybench/eval_no_tools.py b/policybench/eval_no_tools.py
index e042d7b..888eb06 100644
--- a/policybench/eval_no_tools.py
+++ b/policybench/eval_no_tools.py
@@ -42,6 +42,9 @@ def _env_int(name: str, default: int) -> int:
 GEMINI_PRO_REQUEST_TIMEOUT_SECONDS = _env_int(
     "POLICYBENCH_GEMINI_PRO_REQUEST_TIMEOUT_SECONDS", 60
 )
+CLAUDE_REQUEST_TIMEOUT_SECONDS = _env_int(
+    "POLICYBENCH_CLAUDE_REQUEST_TIMEOUT_SECONDS", 90
+)
 XAI_REQUEST_TIMEOUT_SECONDS = _env_int("POLICYBENCH_XAI_REQUEST_TIMEOUT_SECONDS", 420)
 REQUEST_WALL_TIMEOUT_GRACE_SECONDS = 30
 REQUEST_WALL_TIMEOUT_MULTIPLIER = 1.5
@@ -365,6 +368,8 @@ def _request_timeout_seconds(model_id: str) -> int:
         return GEMINI_PRO_REQUEST_TIMEOUT_SECONDS
     if model_id == "gpt-5.5":
         return GEMINI_PRO_REQUEST_TIMEOUT_SECONDS
+    if model_id.startswith("claude-"):
+        return CLAUDE_REQUEST_TIMEOUT_SECONDS
     if model_id.startswith("xai/"):
         return XAI_REQUEST_TIMEOUT_SECONDS
     return REQUEST_TIMEOUT_SECONDS
diff --git a/policybench/scenarios.py b/policybench/scenarios.py
index a3087c4..e8c8ce3 100644
--- a/policybench/scenarios.py
+++ b/policybench/scenarios.py
@@ -90,6 +90,7 @@
     "employer_quarterly_payroll_expense_override",
     "employer_state_unemployment_tax_rate_override",
     "has_itin",
+    "has_champva_health_coverage_at_interview",
     "has_marketplace_health_coverage",
     "has_marketplace_health_coverage_at_interview",
     "has_medicaid_health_coverage_at_interview",
@@ -220,6 +221,31 @@
     "long_term_capital_gains",
 }
 
+# These fields are costs, premiums, loan balances, asset values, taxes, or
+# donations, none of which can be negative in real household facts. Survey or
+# imputation artifacts sometimes encode negative values; omit those inputs
+# rather than asking models to reason from impossible facts. Loss-like income
+# fields, such as rental or partnership income, remain promptable when negative.
+NONNEGATIVE_PROMPT_INPUTS = {
+    "auto_loan_balance",
+    "auto_loan_interest",
+    "charitable_cash_donations",
+    "charitable_non_cash_donations",
+    "deductible_mortgage_interest",
+    "employer_sponsored_insurance_premiums",
+    "first_home_mortgage_balance",
+    "first_home_mortgage_interest",
+    "health_insurance_premiums_without_medicare_part_b",
+    "home_mortgage_interest",
+    "household_vehicles_value",
+    "medicare_part_b_premiums",
+    "other_health_insurance_premiums",
+    "other_medical_expenses",
+    "over_the_counter_health_expenses",
+    "pre_subsidy_rent",
+    "real_estate_taxes",
+}
+
 BASE_CPS_COLUMNS = {
     "person_id": "person_id",
     "household_id": "household_id",
@@ -1056,6 +1082,8 @@ def _extract_entity_inputs(
             continue
 
         value = float(row[spec.output_name])
+        if spec.output_name in NONNEGATIVE_PROMPT_INPUTS and value < 0:
+            continue
         if (
             spec.default_value is not None
             and abs(value - float(spec.default_value)) <= 1e-6
diff --git a/tests/test_analysis.py b/tests/test_analysis.py
index 620594c..82c53ce 100644
--- a/tests/test_analysis.py
+++ b/tests/test_analysis.py
@@ -764,9 +764,7 @@ def test_build_dashboard_payload_matches_frontend_shape(self):
         assert "within10pctRunMean" not in payload["modelStats"][0]
         assert payload["heatmap"][0]["condition"] == "no_tools"
         income_program = next(
-            row
-            for row in payload["programStats"]
-            if row["variable"] == "income_tax"
+            row for row in payload["programStats"] if row["variable"] == "income_tax"
         )
         assert income_program["score"] == pytest.approx(92.5)
         assert income_program["thresholdScore"] == pytest.approx(37.5)
diff --git a/tests/test_eval_no_tools.py b/tests/test_eval_no_tools.py
index 12fdc61..9d5ca46 100644
--- a/tests/test_eval_no_tools.py
+++ b/tests/test_eval_no_tools.py
@@ -646,10 +646,12 @@ def test_premium_tax_credit_prompt_places_ptc_in_health(mini_scenario):
     prompt_lower = prompt.lower()
     assert "aca premium tax credit" in prompt_lower
     assert "marketplace health insurance premium assistance" in prompt_lower
+    assert "assume marketplace enrollment and credit take-up" in prompt_lower
+    assert "do not return 0 merely because marketplace enrollment" in prompt_lower
     assert "what the household knows about the plan they selected" in prompt_lower
     assert "benchmark premium" in prompt_lower
     assert "selected plan costs about the same" in prompt_lower
-    assert "return 0 if" in prompt_lower
+    assert "return 0 only if" in prompt_lower
 
 
 def test_compact_tax_breakdown_prompts_are_explicit(mini_scenario):
@@ -1495,6 +1497,12 @@ def test_gpt_55_uses_longer_full_output_timeout():
     assert _request_timeout_seconds("gpt-5.5") == 60
 
 
+def test_claude_models_use_longer_timeout():
+    """Claude explanation calls can exceed the generic 20s request timeout."""
+    assert _request_timeout_seconds("claude-sonnet-4.6") == 90
+    assert _request_timeout_seconds("claude-opus-4.7") == 90
+
+
 def test_xai_models_use_longer_timeout():
     """xAI models need more than the generic 20s request timeout."""
     assert _request_timeout_seconds("xai/grok-4.3") == 420
diff --git a/tests/test_scenarios.py b/tests/test_scenarios.py
index e06dc57..211a686 100644
--- a/tests/test_scenarios.py
+++ b/tests/test_scenarios.py
@@ -661,6 +661,41 @@ def test_aggregate_net_worth_input_is_not_preserved():
     assert "net_worth" not in scenario.household_inputs
 
 
+def test_impossible_negative_cost_inputs_are_not_preserved():
+    """Negative cost artifacts and CHAMPVA flag are dropped; income losses are kept."""
+    scenario = scenarios_from_cps_frame(
+        pd.DataFrame(
+            [
+                {
+                    "person_id": 1,
+                    "household_id": 1,
+                    "tax_unit_id": 1,
+                    "spm_unit_id": 1,
+                    "family_id": 1,
+                    "household_weight": 1.0,
+                    "state_code": "PA",
+                    "filing_status": "SINGLE",
+                    "age": 35,
+                    "employment_income": 50_000.0,
+                    "has_champva_health_coverage_at_interview": True,
+                    "other_health_insurance_premiums": -5_000.0,
+                    "other_medical_expenses": -100.0,
+                    "partnership_s_corp_income": -2_000.0,
+                    "is_tax_unit_head": True,
+                }
+            ]
+        ),
+        n=1,
+        seed=0,
+    )[0]
+
+    head = scenario.adults[0].inputs
+    assert "has_champva_health_coverage_at_interview" not in head
+    assert "other_health_insurance_premiums" not in head
+    assert "other_medical_expenses" not in head
+    assert head["partnership_s_corp_income"] == -2_000.0
+
+
 def test_formula_overtime_premium_is_not_prompted_or_sent_to_policyengine():
     scenario = scenarios_from_cps_frame(
         pd.DataFrame(
diff --git a/tests/test_snapshot_artifacts.py b/tests/test_snapshot_artifacts.py
index a5a7bc4..478b1ca 100644
--- a/tests/test_snapshot_artifacts.py
+++ b/tests/test_snapshot_artifacts.py
@@ -149,16 +149,15 @@ def _aggregate_scenario_metric(country_payload: dict, metric: str) -> dict[str,
             household_score = 0.0
             for variable, model_map in variables:
                 row = model_map[model]
-                household_score += (
-                    raw_row_weights[variable] / denominator
-                ) * row[metric]
+                household_score += (raw_row_weights[variable] / denominator) * row[
+                    metric
+                ]
             entry = totals.setdefault(model, {"score": 0.0, "households": 0.0})
             entry["score"] += household_score
             entry["households"] += 1
 
     return {
-        model: entry["score"] / entry["households"]
-        for model, entry in totals.items()
+        model: entry["score"] / entry["households"] for model, entry in totals.items()
     }