Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions app/src/components/Hero.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,22 @@ import SiteHeader, { type HeaderNavItem } from "./SiteHeader";

const SNAPSHOT_DATE_LABEL = "Snapshot 2026-05-20";

/**
 * Static notice strip telling readers that the displayed results are
 * provisional.
 *
 * Takes no props and holds no state. It renders a short "Pre-release" tag
 * followed by one explanatory sentence. The utility classes (`flex-col`,
 * `sm:flex-row`) suggest the two pieces stack on narrow viewports and share
 * a row from the `sm` breakpoint up — assuming Tailwind-style semantics.
 */
function PreReleaseBanner() {
  // Class lists are named up front so the markup below reads as structure.
  const stripClasses = "border-y border-warning/30 bg-warning-soft/90";
  const rowClasses =
    "mx-auto flex max-w-7xl flex-col gap-1 px-4 py-3 text-sm text-text sm:flex-row sm:items-center sm:px-6";
  const tagClasses =
    "font-[family-name:var(--font-mono)] text-[11px] font-semibold uppercase tracking-[0.12em] text-warning-text";
  const messageClasses = "text-text-secondary";

  const banner = (
    <div className={stripClasses}>
      <div className={rowClasses}>
        <span className={tagClasses}>Pre-release</span>
        <span className={messageClasses}>
          These results are provisional. We plan to rerun PolicyBench with
          updated data and improved prompts before the final release.
        </span>
      </div>
    </div>
  );

  return banner;
}

export default function Hero({
selectedView,
onSelectView,
Expand Down Expand Up @@ -44,6 +60,7 @@ export default function Hero({
availableViews={availableViews}
actionLink={{ label: "Paper", href: "/paper", type: "internal" }}
/>
<PreReleaseBanner />

<section
aria-labelledby="hero-title"
Expand Down
21 changes: 11 additions & 10 deletions policybench/analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -801,9 +801,7 @@ def _match(row):
pred_flag = binary_flag(pred)
ref_flag = binary_flag(ref)
return (
pred_flag is not None
and ref_flag is not None
and pred_flag == ref_flag
pred_flag is not None and ref_flag is not None and pred_flag == ref_flag
)
return (float(ref) == 0.0) == (float(pred) == 0.0)

Expand Down Expand Up @@ -1245,14 +1243,17 @@ def compute_metrics(
row["threshold_score"] = float(
np.mean([exact, within_1pct, within_5pct, within_10pct])
)
row["score"] = float(
np.mean(
[
bounded_row_score(variable, truth, pred)
for truth, pred in zip(y_true, y_pred, strict=True)
]
row["score"] = (
float(
np.mean(
[
bounded_row_score(variable, truth, pred)
for truth, pred in zip(y_true, y_pred, strict=True)
]
)
)
) * coverage
* coverage
)

rows.append(row)

Expand Down
4 changes: 1 addition & 3 deletions tests/test_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -764,9 +764,7 @@ def test_build_dashboard_payload_matches_frontend_shape(self):
assert "within10pctRunMean" not in payload["modelStats"][0]
assert payload["heatmap"][0]["condition"] == "no_tools"
income_program = next(
row
for row in payload["programStats"]
if row["variable"] == "income_tax"
row for row in payload["programStats"] if row["variable"] == "income_tax"
)
assert income_program["score"] == pytest.approx(92.5)
assert income_program["thresholdScore"] == pytest.approx(37.5)
Expand Down
9 changes: 4 additions & 5 deletions tests/test_snapshot_artifacts.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,16 +149,15 @@ def _aggregate_scenario_metric(country_payload: dict, metric: str) -> dict[str,
household_score = 0.0
for variable, model_map in variables:
row = model_map[model]
household_score += (
raw_row_weights[variable] / denominator
) * row[metric]
household_score += (raw_row_weights[variable] / denominator) * row[
metric
]
entry = totals.setdefault(model, {"score": 0.0, "households": 0.0})
entry["score"] += household_score
entry["households"] += 1

return {
model: entry["score"] / entry["households"]
for model, entry in totals.items()
model: entry["score"] / entry["households"] for model, entry in totals.items()
}


Expand Down