Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog.d/1151.fixed.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Reserve a small but non-trivial share of prior weight for zero-weight PUF clone rows (instead of a near-zero prior) so they stay usable in calibration, and validate that final enhanced CPS weights keep the PUF clone weight share above a minimum floor rather than starving clones of weight.
95 changes: 72 additions & 23 deletions policyengine_us_data/datasets/cps/enhanced_cps.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
from policyengine_us_data.utils import (
ABSOLUTE_ERROR_SCALE_TARGETS,
HOUSEHOLD_COUNT_TARGET,
PUF_CLONE_HOUSEHOLD_COUNT_TARGET_SHARE,
build_loss_matrix,
get_target_error_normalisation,
get_target_loss_weights,
Expand Down Expand Up @@ -44,24 +43,31 @@


HOUSEHOLD_WEIGHT_TOTAL_REL_TOLERANCE = 0.02
PUF_CLONE_HOUSEHOLD_WEIGHT_SHARE_TOLERANCE = 0.10
PERSON_POVERTY_RATE_MIN = 0.05
PERSON_POVERTY_RATE_MAX = 0.25
# PUF clones enter the extended CPS with zero household weight. They are support
# records for calibration, but the earlier bug starved them to ~0 (unusable in
# log-space optimization). Reserve a small but non-trivial share of prior mass
# for them, and validate that final weights keep them above a floor. There is no
# upper cap: the household-count loss target (loss.py) governs how much weight
# clones ultimately carry.
PUF_CLONE_PRIOR_TOTAL_SHARE = 0.05
MIN_PUF_CLONE_HOUSEHOLD_WEIGHT_SHARE_PCT = 5.0
MAX_PUF_CLONE_TAXES_EXCEED_MARKET_INCOME_SHARE_PCT = 25.0


def initialize_weight_priors(
original_weights: np.ndarray,
seed: int = 1456,
epsilon: float = 1e-6,
zero_weight_total_share: float = 0.5,
zero_weight_total_share: float = PUF_CLONE_PRIOR_TOTAL_SHARE,
) -> np.ndarray:
"""Build deterministic positive priors for sparse reweighting.

PUF clone households enter the extended CPS with zero household weight.
Giving those records near-zero priors leaves them effectively unusable in
log-space optimization. When zero-weight rows are present, preserve the
relative distribution of positive survey weights but reserve a fixed share
of the original total household mass for uniform zero-weight-row priors.
Reserve a small but non-trivial share of prior mass for them so they remain
usable in log-space optimization (the earlier bug starved them to ~0). Their
final weight is governed by the household-count loss target, not this prior.
"""

weights = np.asarray(original_weights, dtype=np.float64)
Expand Down Expand Up @@ -135,10 +141,14 @@ def validate_clone_household_weight_share(
household_is_puf_clone: np.ndarray,
*,
year: int,
target_share: float = PUF_CLONE_HOUSEHOLD_COUNT_TARGET_SHARE,
abs_tolerance: float = PUF_CLONE_HOUSEHOLD_WEIGHT_SHARE_TOLERANCE,
min_share: float = MIN_PUF_CLONE_HOUSEHOLD_WEIGHT_SHARE_PCT / 100,
) -> float:
"""Validate that PUF-clone households do not dominate final weights."""
"""Validate that PUF-clone households keep a usable share of final weight.

Clones must not be starved below ``min_share`` (the earlier bug left them at
~0, unusable in log-space optimization). There is no upper cap: the
household-count loss target governs how much weight clones ultimately carry.
"""

weights = np.asarray(weights, dtype=np.float64)
household_is_puf_clone = np.asarray(household_is_puf_clone, dtype=bool)
Expand All @@ -154,12 +164,11 @@ def validate_clone_household_weight_share(
raise ValueError(f"Year {year}: household_weight total must be positive")

clone_share = float(weights[household_is_puf_clone].sum()) / total
if abs(clone_share - target_share) > abs_tolerance:
if clone_share < min_share:
raise ValueError(
f"Year {year}: PUF-clone household weight share "
f"{clone_share:.2%} differs from target {target_share:.2%} by "
f"{abs(clone_share - target_share):.2%}, exceeding "
f"{abs_tolerance:.2%} tolerance"
f"{clone_share:.2%} is below the {min_share:.2%} floor; clones are "
f"being starved of weight"
)

return clone_share
Expand Down Expand Up @@ -201,6 +210,41 @@ def validate_person_poverty_rate(
return poverty_rate


def validate_clone_diagnostics(
    diagnostics: dict[str, float],
    *,
    min_household_weight_share_pct: float = MIN_PUF_CLONE_HOUSEHOLD_WEIGHT_SHARE_PCT,
    max_taxes_exceed_market_income_share_pct: float = (
        MAX_PUF_CLONE_TAXES_EXCEED_MARKET_INCOME_SHARE_PCT
    ),
) -> None:
    """Reject enhanced CPS artifacts whose PUF support clones look unhealthy.

    Two checks are applied to the diagnostics mapping, in this order:

    1. ``clone_household_weight_share_pct`` must be at least
       ``min_household_weight_share_pct`` (a floor; clones must not be
       starved of weight).
    2. ``clone_taxes_exceed_market_income_share_pct`` must not exceed
       ``max_taxes_exceed_market_income_share_pct`` (a data-quality bound).

    There is deliberately no upper cap on the clone household weight share.

    Both values are percentages (e.g. ``10.0`` means 10%). A non-finite value
    (NaN or infinity) is rejected explicitly: NaN compares false against any
    threshold, so without this guard it would pass both checks unnoticed.

    Raises:
        KeyError: if a required diagnostics key is missing.
        ValueError: if a value is non-finite or violates its bound.
    """

    # Local import keeps this block self-contained without touching the
    # module's import section.
    import math

    clone_household_share = diagnostics["clone_household_weight_share_pct"]
    if not math.isfinite(clone_household_share):
        raise ValueError(
            "PUF clone household weight share is not a finite number: "
            f"{clone_household_share!r}"
        )
    if clone_household_share < min_household_weight_share_pct:
        raise ValueError(
            "PUF clone household weight share "
            f"{clone_household_share:.1f}% is below the "
            f"{min_household_weight_share_pct:.1f}% floor"
        )

    taxes_exceed_market_income_share = diagnostics[
        "clone_taxes_exceed_market_income_share_pct"
    ]
    if not math.isfinite(taxes_exceed_market_income_share):
        raise ValueError(
            "PUF clone taxes-exceed-market-income share is not a finite "
            f"number: {taxes_exceed_market_income_share!r}"
        )
    if taxes_exceed_market_income_share > max_taxes_exceed_market_income_share_pct:
        raise ValueError(
            "PUF clone taxes-exceed-market-income share "
            f"{taxes_exceed_market_income_share:.1f}% exceeds "
            f"{max_taxes_exceed_market_income_share_pct:.1f}%"
        )


def _to_numpy(value) -> np.ndarray:
    """Return ``value`` as a NumPy array, unwrapping a ``.values`` attribute if present."""
    # Objects exposing ``.values`` are unwrapped first; anything else is
    # converted as-is.
    underlying = getattr(value, "values", value)
    return np.asarray(underlying)

Expand Down Expand Up @@ -351,17 +395,22 @@ def save_clone_diagnostics_report(
end_year: int,
) -> tuple[Path, dict]:
periods = list(range(start_year, end_year + 1))

def build_validated_payload():
period_to_diagnostics = {
period: build_clone_diagnostics_for_saved_dataset(
dataset_cls,
period,
)
for period in periods
}
for diagnostics in period_to_diagnostics.values():
validate_clone_diagnostics(diagnostics)
return build_clone_diagnostics_payload(period_to_diagnostics)

output_path = refresh_clone_diagnostics_report(
dataset_cls.file_path,
lambda: build_clone_diagnostics_payload(
{
period: build_clone_diagnostics_for_saved_dataset(
dataset_cls,
period,
)
for period in periods
}
),
build_validated_payload,
)
diagnostics_payload = json.loads(output_path.read_text())
return output_path, diagnostics_payload
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ classifiers = [
"Programming Language :: Python :: 3.14",
]
dependencies = [
"policyengine-us==1.715.2",
"policyengine-us==1.715.3",
# policyengine-core 3.26.1 is the current 3.26.x runtime and includes the fix for
# PolicyEngine/policyengine-core#482 (user-set ETERNITY inputs lost
# after _invalidate_all_caches) and is required by policyengine-us 1.682.1+.
Expand Down
13 changes: 8 additions & 5 deletions tests/unit/datasets/test_enhanced_cps_seeding.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
Earlier versions used global ``np.random.normal(1, 0.1, ...)`` jitter before
``reweight()`` reseeded the optimizer. Current code routes both dense CPS
weighting paths through ``initialize_weight_priors()``, which preserves positive
survey weight shape and gives zero-weight clone records deterministic uniform
survey weight shape and gives zero-weight clone records deterministic support
prior mass.
"""

Expand Down Expand Up @@ -86,11 +86,13 @@ def test_validate_household_weight_total_rejects_inflated_total():
)


def test_validate_clone_household_weight_share_accepts_target_share():
def test_validate_clone_household_weight_share_accepts_healthy_share():
from policyengine_us_data.datasets.cps.enhanced_cps import (
validate_clone_household_weight_share,
)

# A high clone share is fine: there is no upper cap (the loss target governs
# how much weight clones carry); the guard only enforces a floor.
share = validate_clone_household_weight_share(
np.array([40_000_000.0, 10_000_000.0, 25_000_000.0, 25_000_000.0]),
np.array([False, False, True, True]),
Expand All @@ -100,14 +102,15 @@ def test_validate_clone_household_weight_share_accepts_target_share():
assert share == pytest.approx(0.5)


def test_validate_clone_household_weight_share_rejects_clone_dominance():
def test_validate_clone_household_weight_share_rejects_clone_starvation():
from policyengine_us_data.datasets.cps.enhanced_cps import (
validate_clone_household_weight_share,
)

with pytest.raises(ValueError, match="PUF-clone household weight share"):
# Clones starved to ~2.4% of weight (below the 5% floor) must fail.
with pytest.raises(ValueError, match="floor"):
validate_clone_household_weight_share(
np.array([10_000_000.0, 10_000_000.0, 40_000_000.0, 40_000_000.0]),
np.array([80_000_000.0, 80_000_000.0, 2_000_000.0, 2_000_000.0]),
np.array([False, False, True, True]),
year=2024,
)
Expand Down
105 changes: 99 additions & 6 deletions tests/unit/test_enhanced_cps_clone_diagnostics.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,20 +9,26 @@
compute_clone_diagnostics_summary,
clone_diagnostics_path,
initialize_weight_priors,
PUF_CLONE_PRIOR_TOTAL_SHARE,
refresh_clone_diagnostics_report,
save_clone_diagnostics_report,
validate_clone_diagnostics,
)


def test_initialize_weight_priors_gives_zero_weight_records_balanced_mass():
def test_initialize_weight_priors_gives_zero_weight_records_support_mass():
weights = np.array([1_500.0, 0.0, 625.0, 0.0], dtype=np.float64)

priors = initialize_weight_priors(weights, seed=123)

assert np.all(priors > 0)
assert priors.sum() == pytest.approx(weights.sum())
assert priors[[0, 2]].sum() == pytest.approx(weights.sum() / 2)
assert priors[[1, 3]].sum() == pytest.approx(weights.sum() / 2)
assert priors[[1, 3]].sum() == pytest.approx(
weights.sum() * PUF_CLONE_PRIOR_TOTAL_SHARE
)
assert priors[[0, 2]].sum() == pytest.approx(
weights.sum() * (1 - PUF_CLONE_PRIOR_TOTAL_SHARE)
)
assert priors[1] == pytest.approx(priors[3])
assert priors[0] / priors[2] == pytest.approx(weights[0] / weights[2])

Expand All @@ -44,6 +50,15 @@ def test_initialize_weight_priors_is_reproducible():
np.testing.assert_allclose(priors_a, priors_b)


def test_initialize_weight_priors_honors_configured_zero_weight_share():
    # With an explicit 50% zero-weight share, the two positive rows keep their
    # 4:1 ratio within half of the total and the two zero rows split the rest.
    original_weights = np.array([80.0, 20.0, 0.0, 0.0])
    expected_priors = np.array([40.0, 10.0, 25.0, 25.0])

    priors = initialize_weight_priors(
        original_weights,
        zero_weight_total_share=0.5,
    )

    np.testing.assert_allclose(priors.sum(), 100.0)
    np.testing.assert_allclose(priors, expected_priors)


def test_compute_clone_diagnostics_summary():
diagnostics = compute_clone_diagnostics_summary(
household_is_puf_clone=[False, True],
Expand All @@ -70,6 +85,49 @@ def test_compute_clone_diagnostics_summary():
)


def test_validate_clone_diagnostics_accepts_support_clone_share():
    # The call must complete without raising for these diagnostics.
    healthy_diagnostics = {
        "clone_household_weight_share_pct": 10.0,
        "clone_taxes_exceed_market_income_share_pct": 5.0,
    }

    validate_clone_diagnostics(healthy_diagnostics)


def test_validate_clone_diagnostics_rejects_clone_starvation():
    # A 2% clone household weight share must be rejected with a "floor" error.
    starved_diagnostics = {
        "clone_household_weight_share_pct": 2.0,
        "clone_taxes_exceed_market_income_share_pct": 5.0,
    }

    with pytest.raises(ValueError, match="floor"):
        validate_clone_diagnostics(starved_diagnostics)


def test_validate_clone_diagnostics_accepts_high_share_no_cap():
    # The validator enforces only a floor on clone weight share, so a very
    # high share must pass as long as the tax-quality metric is within bounds.
    high_share_diagnostics = {
        "clone_household_weight_share_pct": 81.3,
        "clone_taxes_exceed_market_income_share_pct": 5.0,
    }

    validate_clone_diagnostics(high_share_diagnostics)


def test_validate_clone_diagnostics_rejects_clone_tax_pathology():
    # The weight share is healthy, but the taxes-exceed-market-income share is
    # too high, so the second check must raise.
    pathological_diagnostics = {
        "clone_household_weight_share_pct": 10.0,
        "clone_taxes_exceed_market_income_share_pct": 66.6,
    }
    expected_message = "PUF clone taxes-exceed-market-income share"

    with pytest.raises(ValueError, match=expected_message):
        validate_clone_diagnostics(pathological_diagnostics)


def test_build_clone_diagnostics_for_simulation_maps_household_weights(
monkeypatch,
):
Expand Down Expand Up @@ -201,7 +259,11 @@ class DummyDataset:

monkeypatch.setattr(
"policyengine_us_data.datasets.cps.enhanced_cps.build_clone_diagnostics_for_saved_dataset",
lambda dataset_cls, period: {"clone_person_weight_share_pct": float(period)},
lambda dataset_cls, period: {
"clone_person_weight_share_pct": float(period),
"clone_household_weight_share_pct": 10.0,
"clone_taxes_exceed_market_income_share_pct": 5.0,
},
)

output_path, payload = save_clone_diagnostics_report(
Expand All @@ -213,8 +275,39 @@ class DummyDataset:
assert output_path == clone_diagnostics_path(DummyDataset.file_path)
assert payload == {
"periods": {
"2024": {"clone_person_weight_share_pct": 2024.0},
"2025": {"clone_person_weight_share_pct": 2025.0},
"2024": {
"clone_person_weight_share_pct": 2024.0,
"clone_household_weight_share_pct": 10.0,
"clone_taxes_exceed_market_income_share_pct": 5.0,
},
"2025": {
"clone_person_weight_share_pct": 2025.0,
"clone_household_weight_share_pct": 10.0,
"clone_taxes_exceed_market_income_share_pct": 5.0,
},
}
}
assert output_path.exists()


def test_save_clone_diagnostics_report_rejects_bad_clone_payload(tmp_path, monkeypatch):
    # Saving the report must fail when the per-period diagnostics show a
    # starved clone household weight share.
    class DummyDataset:
        file_path = tmp_path / "enhanced_cps_2024.h5"

    DummyDataset.file_path.write_text("placeholder")

    def fake_build_diagnostics(dataset_cls, period):
        # Fresh dict per call, mirroring the real builder returning new data.
        return {
            "clone_person_weight_share_pct": 1.0,
            "clone_household_weight_share_pct": 2.0,
            "clone_taxes_exceed_market_income_share_pct": 5.0,
        }

    monkeypatch.setattr(
        "policyengine_us_data.datasets.cps.enhanced_cps.build_clone_diagnostics_for_saved_dataset",
        fake_build_diagnostics,
    )

    with pytest.raises(ValueError, match="PUF clone household weight share"):
        save_clone_diagnostics_report(
            DummyDataset,
            start_year=2024,
            end_year=2024,
        )
8 changes: 4 additions & 4 deletions uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading
Loading