From 1c963d1542928b4219b05fd3073825ba52b18753 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Wed, 27 May 2026 08:18:14 -0400 Subject: [PATCH 1/2] Add PolicyBench pre-release banner --- app/src/components/Hero.tsx | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/app/src/components/Hero.tsx b/app/src/components/Hero.tsx index 9fac047..364492a 100644 --- a/app/src/components/Hero.tsx +++ b/app/src/components/Hero.tsx @@ -3,6 +3,22 @@ import SiteHeader, { type HeaderNavItem } from "./SiteHeader"; const SNAPSHOT_DATE_LABEL = "Snapshot 2026-05-20"; +function PreReleaseBanner() { + return ( +
+
+ + Pre-release + + + These results are provisional. We plan to rerun PolicyBench with + updated data and improved prompts before the final release. + +
+
+ ); +} + export default function Hero({ selectedView, onSelectView, @@ -44,6 +60,7 @@ export default function Hero({ availableViews={availableViews} actionLink={{ label: "Paper", href: "/paper", type: "internal" }} /> +
Date: Wed, 27 May 2026 08:21:15 -0400 Subject: [PATCH 2/2] Format Python files for CI --- policybench/analysis.py | 21 +++++++++++---------- tests/test_analysis.py | 4 +--- tests/test_snapshot_artifacts.py | 9 ++++----- 3 files changed, 16 insertions(+), 18 deletions(-) diff --git a/policybench/analysis.py b/policybench/analysis.py index fbf49ad..f6a439b 100644 --- a/policybench/analysis.py +++ b/policybench/analysis.py @@ -801,9 +801,7 @@ def _match(row): pred_flag = binary_flag(pred) ref_flag = binary_flag(ref) return ( - pred_flag is not None - and ref_flag is not None - and pred_flag == ref_flag + pred_flag is not None and ref_flag is not None and pred_flag == ref_flag ) return (float(ref) == 0.0) == (float(pred) == 0.0) @@ -1245,14 +1243,17 @@ def compute_metrics( row["threshold_score"] = float( np.mean([exact, within_1pct, within_5pct, within_10pct]) ) - row["score"] = float( - np.mean( - [ - bounded_row_score(variable, truth, pred) - for truth, pred in zip(y_true, y_pred, strict=True) - ] + row["score"] = ( + float( + np.mean( + [ + bounded_row_score(variable, truth, pred) + for truth, pred in zip(y_true, y_pred, strict=True) + ] + ) ) - ) * coverage + * coverage + ) rows.append(row) diff --git a/tests/test_analysis.py b/tests/test_analysis.py index 620594c..82c53ce 100644 --- a/tests/test_analysis.py +++ b/tests/test_analysis.py @@ -764,9 +764,7 @@ def test_build_dashboard_payload_matches_frontend_shape(self): assert "within10pctRunMean" not in payload["modelStats"][0] assert payload["heatmap"][0]["condition"] == "no_tools" income_program = next( - row - for row in payload["programStats"] - if row["variable"] == "income_tax" + row for row in payload["programStats"] if row["variable"] == "income_tax" ) assert income_program["score"] == pytest.approx(92.5) assert income_program["thresholdScore"] == pytest.approx(37.5) diff --git a/tests/test_snapshot_artifacts.py b/tests/test_snapshot_artifacts.py index a5a7bc4..478b1ca 100644 --- a/tests/test_snapshot_artifacts.py +++ b/tests/test_snapshot_artifacts.py @@ -149,16 +149,15 @@ def _aggregate_scenario_metric(country_payload: dict, metric: str) -> dict[str, household_score = 0.0 for variable, model_map in variables: row = model_map[model] - household_score += ( - raw_row_weights[variable] / denominator - ) * row[metric] + household_score += (raw_row_weights[variable] / denominator) * row[ + metric + ] entry = totals.setdefault(model, {"score": 0.0, "households": 0.0}) entry["score"] += household_score entry["households"] += 1 return { - model: entry["score"] / entry["households"] - for model, entry in totals.items() + model: entry["score"] / entry["households"] for model, entry in totals.items() }