From a1232ba439086095dac56e5a7212605d8108a6e9 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Wed, 27 May 2026 21:07:25 -0400 Subject: [PATCH 1/4] Fail ECPS calibration on household count drift --- changelog.d/1150.fixed.md | 1 + .../datasets/cps/enhanced_cps.py | 77 +++++++++++++++---- pyproject.toml | 2 +- .../datasets/test_enhanced_cps_seeding.py | 28 +++++++ uv.lock | 8 +- 5 files changed, 94 insertions(+), 22 deletions(-) create mode 100644 changelog.d/1150.fixed.md diff --git a/changelog.d/1150.fixed.md b/changelog.d/1150.fixed.md new file mode 100644 index 000000000..2a615567b --- /dev/null +++ b/changelog.d/1150.fixed.md @@ -0,0 +1 @@ +Fail Enhanced CPS calibration when final household weights drift from the source household count. diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index de0d0b58e..2a5f2c142 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -42,6 +42,9 @@ torch = None +HOUSEHOLD_WEIGHT_TOTAL_REL_TOLERANCE = 0.02 + + def initialize_weight_priors( original_weights: np.ndarray, seed: int = 1456, @@ -81,6 +84,48 @@ def initialize_weight_priors( return priors +def validate_household_weight_total( + weights: np.ndarray, + *, + source_total: float, + year: int, + rel_tolerance: float = HOUSEHOLD_WEIGHT_TOTAL_REL_TOLERANCE, +) -> float: + """Validate calibrated household weights against the source total.""" + + weights = np.asarray(weights) + if np.any(np.isnan(weights)): + raise ValueError(f"Year {year}: household_weight contains NaN values") + if np.any(weights < 0): + raise ValueError(f"Year {year}: household_weight contains negative values") + + weighted_hh_count = float(np.sum(weights)) + if not (1e8 <= weighted_hh_count <= 2e8): + raise ValueError( + f"Year {year}: weighted household count " + f"{weighted_hh_count:,.0f} outside expected range " + f"[100M, 200M]" + ) + + source_total = float(source_total) + if not np.isfinite(source_total) or source_total <= 0: + raise ValueError( + f"Year {year}: source household count total must be positive; " + f"got {source_total:,.0f}" + ) + + rel_error = abs(weighted_hh_count - source_total) / source_total + if rel_error > rel_tolerance: + raise ValueError( + f"Year {year}: weighted household count " + f"{weighted_hh_count:,.0f} differs from source household count " + f"{source_total:,.0f} by {rel_error:.2%}, exceeding " + f"{rel_tolerance:.2%} tolerance" + ) + + return weighted_hh_count + + def _to_numpy(value) -> np.ndarray: return np.asarray(getattr(value, "values", value)) @@ -639,6 +684,7 @@ def generate(self): data["household_weight"] = {} original_weights = sim.calculate("household_weight") original_weights = initialize_weight_priors(original_weights.values) + source_household_count = float(np.sum(original_weights)) bad_targets = [ "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Head of Household", @@ -688,25 +734,16 @@ def generate(self): ) data["household_weight"][year] = optimised_weights - # Validate dense weights - w = optimised_weights - if np.any(np.isnan(w)): - raise ValueError(f"Year {year}: household_weight contains NaN values") - if np.any(w < 0): - raise ValueError( - f"Year {year}: household_weight contains negative values" - ) - weighted_hh_count = float(np.sum(w)) - if not (1e8 <= weighted_hh_count <= 2e8): - raise ValueError( - f"Year {year}: weighted household count " - f"{weighted_hh_count:,.0f} outside expected range " - f"[100M, 200M]" - ) + weighted_hh_count = validate_household_weight_total( + optimised_weights, + source_total=source_household_count, + year=year, + ) logging.info( f"Year {year}: weights validated — " - f"{weighted_hh_count:,.0f} weighted households, " - f"{int(np.sum(w > 0))} non-zero" + f"{weighted_hh_count:,.0f} weighted households " + f"vs {source_household_count:,.0f} source households, " + f"{int(np.sum(optimised_weights > 0))} non-zero" ) if 2025 in ACA_POST_CALIBRATION_PERSON_TARGETS: @@ -824,9 +861,15 @@ def generate(self): data = sim.dataset.load_dataset() original_weights = sim.calculate("household_weight") original_weights = initialize_weight_priors(original_weights.values) + source_household_count = float(np.sum(original_weights)) for year in [2024]: loss_matrix, targets_array = build_loss_matrix(self.input_dataset, year) optimised_weights = reweight(original_weights, loss_matrix, targets_array) + validate_household_weight_total( + optimised_weights, + source_total=source_household_count, + year=year, + ) data["household_weight"] = optimised_weights self.save_dataset(data) diff --git a/pyproject.toml b/pyproject.toml index 975e75829..4e3259399 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,7 +22,7 @@ classifiers = [ "Programming Language :: Python :: 3.14", ] dependencies = [ - "policyengine-us==1.711.0", + "policyengine-us==1.712.0", # policyengine-core 3.26.1 is the current 3.26.x runtime and includes the fix for # PolicyEngine/policyengine-core#482 (user-set ETERNITY inputs lost # after _invalidate_all_caches) and is required by policyengine-us 1.682.1+. diff --git a/tests/unit/datasets/test_enhanced_cps_seeding.py b/tests/unit/datasets/test_enhanced_cps_seeding.py index 11e7e457d..2e386f942 100644 --- a/tests/unit/datasets/test_enhanced_cps_seeding.py +++ b/tests/unit/datasets/test_enhanced_cps_seeding.py @@ -8,6 +8,7 @@ """ import numpy as np +import pytest from policyengine_us_data.utils.seed import set_seeds @@ -56,3 +57,30 @@ def test_initialize_weight_priors_preserves_source_weight_total(): np.testing.assert_allclose(priors.sum(), 100.0) np.testing.assert_allclose(priors, np.array([40.0, 10.0, 25.0, 25.0])) + + +def test_validate_household_weight_total_accepts_close_total(): + from policyengine_us_data.datasets.cps.enhanced_cps import ( + validate_household_weight_total, + ) + + total = validate_household_weight_total( + np.array([50_000_000.0, 96_000_000.0]), + source_total=145_000_000.0, + year=2024, + ) + + assert total == 146_000_000.0 + + +def test_validate_household_weight_total_rejects_inflated_total(): + from policyengine_us_data.datasets.cps.enhanced_cps import ( + validate_household_weight_total, + ) + + with pytest.raises(ValueError, match="differs from source household count"): + validate_household_weight_total( + np.array([100_000_000.0, 86_900_000.0]), + source_total=145_000_000.0, + year=2024, + ) diff --git a/uv.lock b/uv.lock index 71f4a0242..891d69404 100644 --- a/uv.lock +++ b/uv.lock @@ -2164,7 +2164,7 @@ wheels = [ [[package]] name = "policyengine-us" -version = "1.711.0" +version = "1.712.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "microdf-python" }, @@ -2174,9 +2174,9 @@ dependencies = [ { name = "tables" }, { name = "tqdm" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/48/ed/8825980a62e009610d6fa36f55f6c8a32deb0fb770d1f3513e2df9c7f7fe/policyengine_us-1.711.0.tar.gz", hash = "sha256:c52c8e68f3a01ee5935320175e841459503e67f84c41899f9768f4a5b300b4a3", size = 9956103, upload-time = "2026-05-27T21:31:17.868Z" } +sdist = { url = "https://files.pythonhosted.org/packages/94/de/1c6ed33b769f12a29ee148eaf73399fa11b48d726f8920b513657c4ef2f5/policyengine_us-1.712.0.tar.gz", hash = "sha256:821f9d25fa1893d1e95b090868983f56281a448eec82bb7b11ec5d08814c8e39", size = 9957228, upload-time = "2026-05-27T22:52:32.555Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/ff/aa/3e8471c852c75ecc7c2cbbdaedf79b70a8d207df7f689abfd2b3b570bd7a/policyengine_us-1.711.0-py3-none-any.whl", hash = "sha256:e37d7ee5926954ecf9e03d91ccd190a1609e6322426c12fd6cdd867a913ee2d9", size = 10887738, upload-time = "2026-05-27T21:31:14.859Z" }, + { url = "https://files.pythonhosted.org/packages/03/b0/0073cf07946c52ab7c7098e4570362bfa4c285070014a530aee02bfd452d/policyengine_us-1.712.0-py3-none-any.whl", hash = "sha256:618450b2eca7b15ff73530d0f67fa4d1064104c46d155fc3ecc7f40b1c178956", size = 10887739, upload-time = "2026-05-27T22:52:27.786Z" }, ] [[package]] @@ -2246,7 +2246,7 @@ requires-dist = [ { name = "pandas", specifier = ">=2.3.1" }, { name = "pip-system-certs", specifier = ">=3.0" }, { name = "policyengine-core", specifier = ">=3.26.1,<3.27" }, - { name = "policyengine-us", specifier = "==1.711.0" }, + { name = "policyengine-us", specifier = "==1.712.0" }, { name = "requests", specifier = ">=2.25.0" }, { name = "scipy", specifier = ">=1.15.3" }, { name = "setuptools", specifier = ">=60" }, From 0df3d2ad5d619dc270eaa77c06287f15f5b6f3d0 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Thu, 28 May 2026 21:57:44 -0400 Subject: [PATCH 2/4] Fix Enhanced CPS PUF clone calibration guards --- changelog.d/1150.fixed.md | 2 +- .../calibration/puf_impute.py | 167 +++++++++ .../datasets/cps/enhanced_cps.py | 92 +++++ policyengine_us_data/utils/__init__.py | 3 + policyengine_us_data/utils/loss.py | 61 +++- .../test_calibration_puf_impute.py | 324 ++++++++++++++++++ tests/unit/calibration/test_loss_targets.py | 36 ++ .../datasets/test_enhanced_cps_seeding.py | 66 ++++ 8 files changed, 747 insertions(+), 4 deletions(-) diff --git a/changelog.d/1150.fixed.md b/changelog.d/1150.fixed.md index 2a615567b..3b53b925c 100644 --- a/changelog.d/1150.fixed.md +++ b/changelog.d/1150.fixed.md @@ -1 +1 @@ -Fail Enhanced CPS calibration when final household weights drift from the source household count. +Fix Enhanced CPS PUF-clone calibration by anchoring source household weights and excluding Forbes-scale PUF donors from clone training. diff --git a/policyengine_us_data/calibration/puf_impute.py b/policyengine_us_data/calibration/puf_impute.py index cda23d446..f8c500e68 100644 --- a/policyengine_us_data/calibration/puf_impute.py +++ b/policyengine_us_data/calibration/puf_impute.py @@ -40,6 +40,35 @@ PUF_SUBSAMPLE_TARGET = 20_000 PUF_TOP_PERCENTILE = 99.5 +FORBES_SYNTHETIC_FINANCIAL_THRESHOLD = 250_000_000 +PUF_METADATA_MISSING_TOP_TAIL_THRESHOLD = 10_000_000 +FORBES_METADATA_VARIABLES = ( + "forbes_unit_id", + "forbes_replicate_id", + "forbes_rank", +) +PUF_METADATA_MISSING_TOP_TAIL_VARIABLES = ( + "adjusted_gross_income", + "qualified_dividend_income", + "non_qualified_dividend_income", + "taxable_interest_income", + "tax_exempt_interest_income", + "long_term_capital_gains", + "short_term_capital_gains", + "non_sch_d_capital_gains", + "long_term_capital_gains_on_collectibles", + "unrecaptured_section_1250_gain", + "partnership_s_corp_income", + "self_employment_income", + "sstb_self_employment_income", + "rental_income", + "farm_income", + "farm_rent_income", + "farm_operations_income", + "estate_income", + "charitable_cash_donations", + "charitable_non_cash_donations", +) DEMOGRAPHIC_PREDICTORS = [ "age", @@ -925,6 +954,7 @@ def _run_qrf_imputation( puf_sim = Microsimulation(dataset=puf_dataset) puf_agi = puf_sim.calculate("adjusted_gross_income", map_to="person").values + puf_data = puf_sim.dataset.load_dataset() X_train_full = puf_sim.calculate_dataframe( DEMOGRAPHIC_PREDICTORS + IMPUTED_VARIABLES @@ -936,6 +966,69 @@ def _run_qrf_imputation( del puf_sim + tax_unit_ids = _period_array(puf_data, "tax_unit_id", time_period) + has_forbes_metadata = _has_forbes_metadata( + puf_data, + time_period, + expected_length=0 if tax_unit_ids is None else len(tax_unit_ids), + ) + forbes_person_mask = _forbes_person_training_mask( + puf_data, + time_period, + n_persons=len(puf_agi), + ) + if has_forbes_metadata: + top_tail_threshold = FORBES_SYNTHETIC_FINANCIAL_THRESHOLD + top_tail_label = "Forbes" + low_weight_mask = np.ones_like(forbes_person_mask, dtype=bool) + else: + top_tail_threshold = PUF_METADATA_MISSING_TOP_TAIL_THRESHOLD + top_tail_label = "metadata-missing top-tail" + low_weight_mask = np.ones_like(forbes_person_mask, dtype=bool) + + forbes_person_mask |= low_weight_mask & (puf_agi >= top_tail_threshold) + for frame in (X_train_full, X_train_override): + candidate_columns = ( + IMPUTED_VARIABLES + OVERRIDDEN_IMPUTED_VARIABLES + if has_forbes_metadata + else PUF_METADATA_MISSING_TOP_TAIL_VARIABLES + ) + financial_columns = [ + column + for column in candidate_columns + if column in frame.columns + ] + if financial_columns: + forbes_person_mask |= ( + low_weight_mask + & ( + frame[financial_columns].abs().max(axis=1).to_numpy() + >= top_tail_threshold + ) + ) + if len(forbes_person_mask) == len(puf_agi) and forbes_person_mask.any(): + if len(X_train_full) != len(forbes_person_mask) or len(X_train_override) != len( + forbes_person_mask + ): + logger.warning( + "Skipping Forbes donor exclusion because QRF training " + "frames do not match person-level PUF metadata lengths" + ) + else: + logger.info( + "Excluding %d %s person records from PUF QRF training " + "at threshold $%s", + int(forbes_person_mask.sum()), + top_tail_label, + f"{top_tail_threshold:,.0f}", + ) + non_forbes_mask = ~forbes_person_mask + puf_agi = puf_agi[non_forbes_mask] + X_train_full = X_train_full.loc[non_forbes_mask].reset_index(drop=True) + X_train_override = X_train_override.loc[non_forbes_mask].reset_index( + drop=True + ) + sub_idx = _stratified_subsample_index(puf_agi) _log_stratified_subsample( len(puf_agi), @@ -975,6 +1068,80 @@ def _run_qrf_imputation( return y_full, y_override +def _period_array( + data: Dict[str, Dict[int, np.ndarray]], + variable: str, + time_period: int, +) -> Optional[np.ndarray]: + if variable not in data: + return None + values = data[variable] + if isinstance(values, dict): + values = values.get(time_period, values.get(str(time_period))) + if values is None: + return None + return np.asarray(values) + + +def _has_forbes_metadata( + puf_data: Dict[str, Dict[int, np.ndarray]], + time_period: int, + expected_length: int, +) -> bool: + """Return whether usable Forbes synthetic-record metadata is present.""" + if expected_length <= 0: + return False + for variable in FORBES_METADATA_VARIABLES: + values = _period_array(puf_data, variable, time_period) + if values is not None and len(values) == expected_length: + return True + return False + + +def _forbes_person_training_mask( + puf_data: Dict[str, Dict[int, np.ndarray]], + time_period: int, + n_persons: int, +) -> np.ndarray: + """Return person-level mask for synthetic Forbes top-tail PUF records.""" + tax_unit_id = _period_array(puf_data, "tax_unit_id", time_period) + person_tax_unit_id = _period_array(puf_data, "person_tax_unit_id", time_period) + if tax_unit_id is None or person_tax_unit_id is None: + return np.zeros(n_persons, dtype=bool) + if len(person_tax_unit_id) != n_persons: + return np.zeros(n_persons, dtype=bool) + + tax_unit_forbes = np.zeros(len(tax_unit_id), dtype=bool) + for variable, default_threshold in ( + ("forbes_unit_id", 0), + ("forbes_replicate_id", 0), + ("forbes_rank", 1), + ): + values = _period_array(puf_data, variable, time_period) + if values is None or len(values) != len(tax_unit_id): + continue + values = np.asarray(values, dtype=float) + if default_threshold == 0: + tax_unit_forbes |= values >= 0 + else: + tax_unit_forbes |= values >= default_threshold + + if not tax_unit_forbes.any(): + return np.zeros(n_persons, dtype=bool) + + sorted_index = np.argsort(tax_unit_id) + sorted_tax_unit_id = tax_unit_id[sorted_index] + sorted_tax_unit_forbes = tax_unit_forbes[sorted_index] + + positions = np.searchsorted(sorted_tax_unit_id, person_tax_unit_id) + valid = positions < len(sorted_tax_unit_id) + person_mask = np.zeros(n_persons, dtype=bool) + valid_positions = positions[valid] + valid[valid] = sorted_tax_unit_id[valid_positions] == person_tax_unit_id[valid] + person_mask[valid] = sorted_tax_unit_forbes[positions[valid]] + return person_mask + + def _stratified_subsample_index( income: np.ndarray, target_n: int = PUF_SUBSAMPLE_TARGET, diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index 2a5f2c142..ded79eca0 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -7,6 +7,7 @@ from policyengine_us_data.utils import ( ABSOLUTE_ERROR_SCALE_TARGETS, HOUSEHOLD_COUNT_TARGET, + PUF_CLONE_HOUSEHOLD_COUNT_TARGET_SHARE, build_loss_matrix, get_target_error_normalisation, get_target_loss_weights, @@ -43,6 +44,9 @@ HOUSEHOLD_WEIGHT_TOTAL_REL_TOLERANCE = 0.02 +PUF_CLONE_HOUSEHOLD_WEIGHT_SHARE_TOLERANCE = 0.10 +PERSON_POVERTY_RATE_MIN = 0.05 +PERSON_POVERTY_RATE_MAX = 0.25 def initialize_weight_priors( @@ -126,6 +130,77 @@ def validate_household_weight_total( return weighted_hh_count +def validate_clone_household_weight_share( + weights: np.ndarray, + household_is_puf_clone: np.ndarray, + *, + year: int, + target_share: float = PUF_CLONE_HOUSEHOLD_COUNT_TARGET_SHARE, + abs_tolerance: float = PUF_CLONE_HOUSEHOLD_WEIGHT_SHARE_TOLERANCE, +) -> float: + """Validate that PUF-clone households do not dominate final weights.""" + + weights = np.asarray(weights, dtype=np.float64) + household_is_puf_clone = np.asarray(household_is_puf_clone, dtype=bool) + if len(weights) != len(household_is_puf_clone): + raise ValueError( + f"Year {year}: household_is_puf_clone length " + f"{len(household_is_puf_clone)} does not match household_weight " + f"length {len(weights)}" + ) + + total = float(weights.sum()) + if total <= 0: + raise ValueError(f"Year {year}: household_weight total must be positive") + + clone_share = float(weights[household_is_puf_clone].sum()) / total + if abs(clone_share - target_share) > abs_tolerance: + raise ValueError( + f"Year {year}: PUF-clone household weight share " + f"{clone_share:.2%} differs from target {target_share:.2%} by " + f"{abs(clone_share - target_share):.2%}, exceeding " + f"{abs_tolerance:.2%} tolerance" + ) + + return clone_share + + +def _period_array_from_loaded_dataset( + data: dict, + variable_name: str, + period: int, +) -> np.ndarray: + values_by_period = data[variable_name] + if period in values_by_period: + return values_by_period[period] + period_key = str(period) + if period_key in values_by_period: + return values_by_period[period_key] + raise KeyError(f"{variable_name}[{period}] not found in loaded dataset") + + +def validate_person_poverty_rate( + sim, + *, + year: int, + min_rate: float = PERSON_POVERTY_RATE_MIN, + max_rate: float = PERSON_POVERTY_RATE_MAX, +) -> float: + """Fail fast when calibrated weights imply an implausible poverty rate.""" + + poverty_rate = float( + sim.calc("person_in_poverty", period=year, map_to="person").mean() + ) + if not np.isfinite(poverty_rate): + raise ValueError(f"Year {year}: person poverty rate is not finite") + if not (min_rate <= poverty_rate <= max_rate): + raise ValueError( + f"Year {year}: person poverty rate {poverty_rate:.2%} outside " + f"expected range [{min_rate:.2%}, {max_rate:.2%}]" + ) + return poverty_rate + + def _to_numpy(value) -> np.ndarray: return np.asarray(getattr(value, "values", value)) @@ -733,16 +808,33 @@ def generate(self): seed=1456, ) data["household_weight"][year] = optimised_weights + sim.set_input( + "household_weight", + year, + optimised_weights.astype(np.float32), + ) weighted_hh_count = validate_household_weight_total( optimised_weights, source_total=source_household_count, year=year, ) + clone_household_share = validate_clone_household_weight_share( + optimised_weights, + _period_array_from_loaded_dataset( + data, + "household_is_puf_clone", + year, + ), + year=year, + ) + poverty_rate = validate_person_poverty_rate(sim, year=year) logging.info( f"Year {year}: weights validated — " f"{weighted_hh_count:,.0f} weighted households " f"vs {source_household_count:,.0f} source households, " + f"{clone_household_share:.1%} PUF-clone household share, " + f"{poverty_rate:.1%} person poverty rate, " f"{int(np.sum(optimised_weights > 0))} non-zero" ) diff --git a/policyengine_us_data/utils/__init__.py b/policyengine_us_data/utils/__init__.py index a0a0f2507..841bcde9e 100644 --- a/policyengine_us_data/utils/__init__.py +++ b/policyengine_us_data/utils/__init__.py @@ -10,8 +10,11 @@ __all__ = [ "ABSOLUTE_ERROR_SCALE_TARGETS", + "CPS_HOUSEHOLD_COUNT_TARGET", "HardConcrete", "HOUSEHOLD_COUNT_TARGET", + "PUF_CLONE_HOUSEHOLD_COUNT_TARGET", + "PUF_CLONE_HOUSEHOLD_COUNT_TARGET_SHARE", "build_loss_matrix", "get_target_error_normalisation", "get_target_loss_weights", diff --git a/policyengine_us_data/utils/loss.py b/policyengine_us_data/utils/loss.py index 373dc4731..9fe2116a9 100644 --- a/policyengine_us_data/utils/loss.py +++ b/policyengine_us_data/utils/loss.py @@ -83,7 +83,10 @@ BEA_NIPA_DIRECT_SUM_LOSS_WEIGHT = 1_000.0 BEA_WAGES_AND_SALARIES_LOSS_WEIGHT = 1_000.0 HOUSEHOLD_COUNT_TARGET = "nation/source/household_count" -HOUSEHOLD_COUNT_LOSS_WEIGHT = 1_000.0 +CPS_HOUSEHOLD_COUNT_TARGET = "nation/source/cps_household_count" +PUF_CLONE_HOUSEHOLD_COUNT_TARGET = "nation/source/puf_clone_household_count" +PUF_CLONE_HOUSEHOLD_COUNT_TARGET_SHARE = 0.5 +HOUSEHOLD_COUNT_LOSS_WEIGHT = 100_000.0 CBO_INCOME_BY_SOURCE_TARGETS = [ ("irs_employment_income", "employment_income"), @@ -1201,7 +1204,28 @@ def _add_transfer_balance_targets(loss_matrix, targets_list, sim, time_period): return targets_list, loss_matrix -def _add_household_count_target(loss_matrix, targets_list, sim): +def _load_household_is_puf_clone(dataset, time_period): + file_path = getattr(dataset, "file_path", None) + if file_path is None or not file_path.exists(): + return None + + import h5py + + with h5py.File(file_path, "r") as h5_file: + if "household_is_puf_clone" not in h5_file: + return None + obj = h5_file["household_is_puf_clone"] + if isinstance(obj, h5py.Dataset): + return np.asarray(obj[...], dtype=bool) + period_key = str(time_period) + if period_key in obj: + return np.asarray(obj[period_key][...], dtype=bool) + if str(int(time_period)) in obj: + return np.asarray(obj[str(int(time_period))][...], dtype=bool) + return None + + +def _add_household_count_target(loss_matrix, targets_list, sim, dataset, time_period): """Constrain total household weight to the source survey total.""" household_weights = sim.calculate("household_weight").values @@ -1223,6 +1247,27 @@ def _add_household_count_target(loss_matrix, targets_list, sim): dtype=np.float32, ) targets_list.append(target) + + household_is_puf_clone = _load_household_is_puf_clone(dataset, time_period) + if household_is_puf_clone is not None: + if len(household_is_puf_clone) != len(household_weights): + raise ValueError( + "PUF clone flag length mismatch: " + f"household_is_puf_clone has {len(household_is_puf_clone)} " + f"values but household_weight has {len(household_weights)} values" + ) + + puf_clone_target = target * PUF_CLONE_HOUSEHOLD_COUNT_TARGET_SHARE + cps_target = target - puf_clone_target + loss_matrix[CPS_HOUSEHOLD_COUNT_TARGET] = ( + ~household_is_puf_clone + ).astype(np.float32) + targets_list.append(cps_target) + loss_matrix[PUF_CLONE_HOUSEHOLD_COUNT_TARGET] = ( + household_is_puf_clone + ).astype(np.float32) + targets_list.append(puf_clone_target) + return targets_list, loss_matrix @@ -1254,7 +1299,15 @@ def get_target_loss_weights(target_names): ) | np.char.startswith(target_names, "state/bea/wages_and_salaries/") weights[is_bea_direct_sum_target] = BEA_NIPA_DIRECT_SUM_LOSS_WEIGHT weights[is_bea_wage_target] = BEA_WAGES_AND_SALARIES_LOSS_WEIGHT - weights[target_names == HOUSEHOLD_COUNT_TARGET] = HOUSEHOLD_COUNT_LOSS_WEIGHT + household_count_targets = np.isin( + target_names, + [ + HOUSEHOLD_COUNT_TARGET, + CPS_HOUSEHOLD_COUNT_TARGET, + PUF_CLONE_HOUSEHOLD_COUNT_TARGET, + ], + ) + weights[household_count_targets] = HOUSEHOLD_COUNT_LOSS_WEIGHT return weights @@ -1392,6 +1445,8 @@ def build_loss_matrix(dataset: type, time_period): loss_matrix, targets_array, sim, + dataset, + time_period, ) # Census single-year age population projections diff --git a/tests/unit/calibration/test_calibration_puf_impute.py b/tests/unit/calibration/test_calibration_puf_impute.py index 6f2f60561..485f6eba1 100644 --- a/tests/unit/calibration/test_calibration_puf_impute.py +++ b/tests/unit/calibration/test_calibration_puf_impute.py @@ -12,9 +12,11 @@ DEMOGRAPHIC_PREDICTORS, IMPUTED_VARIABLES, OVERRIDDEN_IMPUTED_VARIABLES, + _forbes_person_training_mask, _impute_retirement_contributions, _impute_weeks_unemployed, _log_stratified_subsample, + _run_qrf_imputation, _stratified_subsample_index, puf_clone_dataset, ) @@ -384,6 +386,328 @@ def test_indices_sorted(self): assert np.all(idx[1:] >= idx[:-1]) +class TestForbesTrainingExclusion: + def test_maps_forbes_tax_units_to_person_records(self): + data = { + "tax_unit_id": {2024: np.array([10, 20, 30])}, + "person_tax_unit_id": {2024: np.array([10, 20, 20, 30])}, + "forbes_unit_id": {2024: np.array([-1, 0, -1])}, + "forbes_replicate_id": {2024: np.array([-1, 3, -1])}, + "forbes_rank": {2024: np.array([0, 42, 0])}, + } + + result = _forbes_person_training_mask(data, 2024, n_persons=4) + + np.testing.assert_array_equal( + result, + np.array([False, True, True, False]), + ) + + def test_missing_forbes_metadata_keeps_all_records(self): + data = { + "tax_unit_id": {2024: np.array([10, 20])}, + "person_tax_unit_id": {2024: np.array([10, 20])}, + } + + result = _forbes_person_training_mask(data, 2024, n_persons=2) + + np.testing.assert_array_equal(result, np.array([False, False])) + + def test_qrf_training_filters_forbes_person_records(self, monkeypatch): + class FakeDataset: + def load_dataset(self): + return { + "tax_unit_id": {2024: np.array([10, 20, 30])}, + "person_tax_unit_id": {2024: np.array([10, 20, 30, 30])}, + "forbes_unit_id": {2024: np.array([-1, 0, -1])}, + "forbes_rank": {2024: np.array([0, 1, 0])}, + "forbes_replicate_id": {2024: np.array([-1, 0, -1])}, + } + + class FakeMicrosimulation: + def __init__(self, dataset): + self.dataset = FakeDataset() + + def calculate(self, variable, map_to=None): + assert map_to == "person" + assert variable == "adjusted_gross_income" + return pd.Series([10.0, 30_000_000.0, 20.0, 30.0]) + + def calculate_dataframe(self, columns): + frame = pd.DataFrame({"age": [40.0, 99.0, 50.0, 55.0]}) + for column in columns: + if column not in frame: + frame[column] = 0.0 + return frame[list(columns)] + + train_frames = [] + + def fake_sequential_qrf(X_train, X_test, predictors, output_vars): + train_frames.append(X_train.copy()) + return {variable: np.zeros(len(X_test)) for variable in output_vars} + + monkeypatch.setattr("policyengine_us.Microsimulation", FakeMicrosimulation) + monkeypatch.setattr( + puf_impute_module, + "_sequential_qrf", + fake_sequential_qrf, + ) + + data = { + predictor: {2024: np.array([0.0, 1.0])} + for predictor in DEMOGRAPHIC_PREDICTORS + } + _run_qrf_imputation( + data=data, + time_period=2024, + puf_dataset=object(), + dataset_path=None, + ) + + assert len(train_frames) == 2 + assert all(len(frame) == 3 for frame in train_frames) + assert all(99.0 not in set(frame["age"]) for frame in train_frames) + + def test_qrf_training_filters_synthetic_top_tail_without_metadata( + self, monkeypatch + ): + class FakeDataset: + def load_dataset(self): + return { + "tax_unit_id": {2024: np.array([10, 20, 30])}, + "person_tax_unit_id": {2024: np.array([10, 20, 30, 30])}, + } + + class FakeMicrosimulation: + def __init__(self, dataset): + self.dataset = FakeDataset() + + def calculate(self, variable, map_to=None): + assert map_to == "person" + if variable == "person_weight": + return pd.Series([100.0, 0.13, 100.0, 100.0]) + assert variable == "adjusted_gross_income" + return pd.Series([10.0, 1_000_000_000.0, 20.0, 30.0]) + + def calculate_dataframe(self, columns): + frame = pd.DataFrame({"age": [40.0, 99.0, 50.0, 55.0]}) + for column in columns: + if column not in frame: + frame[column] = 0.0 + return frame[list(columns)] + + train_frames = [] + + def fake_sequential_qrf(X_train, X_test, predictors, output_vars): + train_frames.append(X_train.copy()) + return {variable: np.zeros(len(X_test)) for variable in output_vars} + + monkeypatch.setattr("policyengine_us.Microsimulation", FakeMicrosimulation) + monkeypatch.setattr( + puf_impute_module, + "_sequential_qrf", + fake_sequential_qrf, + ) + + data = { + predictor: {2024: np.array([0.0, 1.0])} + for predictor in DEMOGRAPHIC_PREDICTORS + } + _run_qrf_imputation( + data=data, + time_period=2024, + puf_dataset=object(), + dataset_path=None, + ) + + assert len(train_frames) == 2 + assert all(len(frame) == 3 for frame in train_frames) + assert all(99.0 not in set(frame["age"]) for frame in train_frames) + + def test_qrf_training_filters_synthetic_top_tail_components_without_metadata( + self, monkeypatch + ): + class FakeDataset: + def load_dataset(self): + return { + "tax_unit_id": {2024: np.array([10, 20, 30])}, + "person_tax_unit_id": {2024: np.array([10, 20, 30, 30])}, + } + + class FakeMicrosimulation: + def __init__(self, dataset): + self.dataset = FakeDataset() + + def calculate(self, variable, map_to=None): + assert map_to == "person" + assert variable == "adjusted_gross_income" + return pd.Series([10.0, 20.0, 30.0, 40.0]) + + def calculate_dataframe(self, columns): + frame = pd.DataFrame( + { + "age": [40.0, 99.0, 50.0, 55.0], + "long_term_capital_gains": [ + 0.0, + 30_000_000.0, + 0.0, + 0.0, + ], + } + ) + for column in columns: + if column not in frame: + frame[column] = 0.0 + return frame[list(columns)] + + train_frames = [] + + def fake_sequential_qrf(X_train, X_test, predictors, output_vars): + train_frames.append(X_train.copy()) + return {variable: np.zeros(len(X_test)) for variable in output_vars} + + monkeypatch.setattr("policyengine_us.Microsimulation", FakeMicrosimulation) + monkeypatch.setattr( + puf_impute_module, + "_sequential_qrf", + fake_sequential_qrf, + ) + + data = { + predictor: {2024: np.array([0.0, 1.0])} + for predictor in DEMOGRAPHIC_PREDICTORS + } + _run_qrf_imputation( + data=data, + time_period=2024, + puf_dataset=object(), + dataset_path=None, + ) + + assert len(train_frames) == 2 + assert all(len(frame) == 3 for frame in train_frames) + assert all(99.0 not in set(frame["age"]) for frame in train_frames) + + def test_qrf_training_filters_normal_weight_top_tail_without_metadata( + self, monkeypatch + ): + class FakeDataset: + def load_dataset(self): + return { + "tax_unit_id": {2024: np.array([10, 20, 30])}, + "person_tax_unit_id": {2024: np.array([10, 20, 30, 30])}, + } + + class FakeMicrosimulation: + def __init__(self, dataset): + self.dataset = FakeDataset() + + def calculate(self, variable, map_to=None): + assert map_to == "person" + assert variable == "adjusted_gross_income" + return pd.Series([10.0, 30_000_000.0, 20.0, 30.0]) + + def calculate_dataframe(self, columns): + frame = pd.DataFrame({"age": [40.0, 99.0, 50.0, 55.0]}) + for column in columns: + if column not in frame: + frame[column] = 0.0 + return frame[list(columns)] + + train_frames = [] + + def fake_sequential_qrf(X_train, X_test, predictors, output_vars): + train_frames.append(X_train.copy()) + return {variable: np.zeros(len(X_test)) for variable in output_vars} + + monkeypatch.setattr("policyengine_us.Microsimulation", FakeMicrosimulation) + monkeypatch.setattr( + puf_impute_module, + "_sequential_qrf", + fake_sequential_qrf, + ) + + data = { + predictor: {2024: np.array([0.0, 1.0])} + for predictor in DEMOGRAPHIC_PREDICTORS + } + _run_qrf_imputation( + data=data, + time_period=2024, + puf_dataset=object(), + dataset_path=None, + ) + + assert len(train_frames) == 2 + assert all(len(frame) == 3 for frame in train_frames) + assert all(99.0 not in set(frame["age"]) for frame in train_frames) + + def test_qrf_training_keeps_non_forbes_top_tail_with_metadata( + self, monkeypatch + ): + class FakeDataset: + def load_dataset(self): + return { + "tax_unit_id": {2024: np.array([10, 20, 30])}, + "person_tax_unit_id": {2024: np.array([10, 20, 30, 30])}, + "forbes_unit_id": {2024: np.array([-1, -1, -1])}, + } + + class FakeMicrosimulation: + def __init__(self, dataset): + self.dataset = FakeDataset() + + def calculate(self, variable, map_to=None): + assert map_to == "person" + assert variable == "adjusted_gross_income" + return pd.Series([10.0, 20.0, 30.0, 40.0]) + + def calculate_dataframe(self, columns): + frame = pd.DataFrame( + { + "age": [40.0, 99.0, 50.0, 55.0], + "long_term_capital_gains": [ + 0.0, + 30_000_000.0, + 0.0, + 0.0, + ], + } + ) + for column in columns: + if column not in frame: + frame[column] = 0.0 + return frame[list(columns)] + + train_frames = [] + + def fake_sequential_qrf(X_train, X_test, predictors, output_vars): + train_frames.append(X_train.copy()) + return {variable: np.zeros(len(X_test)) for variable in output_vars} + + monkeypatch.setattr("policyengine_us.Microsimulation", FakeMicrosimulation) + monkeypatch.setattr( + puf_impute_module, + "_sequential_qrf", + fake_sequential_qrf, + ) + + data = { + predictor: {2024: np.array([0.0, 1.0])} + for predictor in DEMOGRAPHIC_PREDICTORS + } + _run_qrf_imputation( + data=data, + time_period=2024, + puf_dataset=object(), + dataset_path=None, + ) + + assert len(train_frames) == 2 + assert all(len(frame) == 4 for frame in train_frames) + assert all(99.0 in set(frame["age"]) for frame in train_frames) + + def test_retirement_imputation_uses_sstb_income_for_se_eligibility(monkeypatch): class FakeMicrosimulation: def __init__(self, dataset): diff --git a/tests/unit/calibration/test_loss_targets.py b/tests/unit/calibration/test_loss_targets.py index 2cacbbc9a..544bb9715 100644 --- a/tests/unit/calibration/test_loss_targets.py +++ b/tests/unit/calibration/test_loss_targets.py @@ -1,6 +1,7 @@ import inspect from types import SimpleNamespace +import h5py import numpy as np import pandas as pd import pytest @@ -14,11 +15,13 @@ BEA_NIPA_DIRECT_SUM_TARGETS, BEA_NIPA_DIRECT_SUM_LOSS_WEIGHT, BEA_WAGES_AND_SALARIES_LOSS_WEIGHT, + CPS_HOUSEHOLD_COUNT_TARGET, BLS_CE_TOTALS, HARD_CODED_TOTALS, HOUSEHOLD_COUNT_LOSS_WEIGHT, HOUSEHOLD_COUNT_TARGET, LOW_AGI_INVESTMENT_INCOME_SOI_VARIABLES, + PUF_CLONE_HOUSEHOLD_COUNT_TARGET, SOI_NEGATIVE_AGI_TARGETED_VARIABLES, TRANSFER_BALANCE_TARGETS, _add_bea_state_wage_targets, @@ -174,6 +177,8 @@ def test_household_count_target_gets_higher_loss_weight(): target_names = np.array( [ HOUSEHOLD_COUNT_TARGET, + CPS_HOUSEHOLD_COUNT_TARGET, + PUF_CLONE_HOUSEHOLD_COUNT_TARGET, "nation/census/population_by_age/0", ] ) @@ -181,6 +186,8 @@ def test_household_count_target_gets_higher_loss_weight(): weights = get_target_loss_weights(target_names) assert weights.tolist() == [ + HOUSEHOLD_COUNT_LOSS_WEIGHT, + HOUSEHOLD_COUNT_LOSS_WEIGHT, HOUSEHOLD_COUNT_LOSS_WEIGHT, 1.0, ] @@ -464,6 +471,8 @@ def test_add_household_count_target_uses_source_weight_total(): loss_matrix, [], _FakeHouseholdWeightSimulation([80.0, 20.0, 0.0, 0.0]), + SimpleNamespace(), + 2024, ) assert targets == [100.0] @@ -473,6 +482,33 @@ def test_add_household_count_target_uses_source_weight_total(): ) +def test_add_household_count_target_adds_clone_split_targets(tmp_path): + file_path = tmp_path / "extended_cps_2024.h5" + with h5py.File(file_path, "w") as h5_file: + group = h5_file.create_group("household_is_puf_clone") + group.create_dataset("2024", data=np.array([0, 0, 1, 1], dtype=np.int8)) + + loss_matrix = pd.DataFrame(index=[101, 102, 103, 104]) + + targets, loss_matrix = _add_household_count_target( + loss_matrix, + [], + _FakeHouseholdWeightSimulation([80.0, 20.0, 0.0, 0.0]), + SimpleNamespace(file_path=file_path), + 2024, + ) + + assert targets == [100.0, 50.0, 50.0] + np.testing.assert_array_equal( + loss_matrix[CPS_HOUSEHOLD_COUNT_TARGET].to_numpy(), + np.array([1.0, 1.0, 0.0, 0.0], dtype=np.float32), + ) + np.testing.assert_array_equal( + loss_matrix[PUF_CLONE_HOUSEHOLD_COUNT_TARGET].to_numpy(), + np.array([0.0, 0.0, 1.0, 1.0], dtype=np.float32), + ) + + def test_build_loss_matrix_adds_household_count_target_before_reweighting(): source = inspect.getsource(build_loss_matrix) diff --git a/tests/unit/datasets/test_enhanced_cps_seeding.py b/tests/unit/datasets/test_enhanced_cps_seeding.py index 2e386f942..e81f44ab9 100644 --- a/tests/unit/datasets/test_enhanced_cps_seeding.py +++ b/tests/unit/datasets/test_enhanced_cps_seeding.py @@ -84,3 +84,69 @@ def test_validate_household_weight_total_rejects_inflated_total(): source_total=145_000_000.0, year=2024, ) + + +def test_validate_clone_household_weight_share_accepts_target_share(): + from policyengine_us_data.datasets.cps.enhanced_cps import ( + validate_clone_household_weight_share, + ) + + share = validate_clone_household_weight_share( + np.array([40_000_000.0, 10_000_000.0, 25_000_000.0, 25_000_000.0]), + np.array([False, False, True, True]), + year=2024, + ) + + assert share == pytest.approx(0.5) + + +def test_validate_clone_household_weight_share_rejects_clone_dominance(): + from policyengine_us_data.datasets.cps.enhanced_cps import ( + validate_clone_household_weight_share, + ) + + with pytest.raises(ValueError, match="PUF-clone household weight share"): + validate_clone_household_weight_share( + np.array([10_000_000.0, 10_000_000.0, 40_000_000.0, 40_000_000.0]), + np.array([False, False, True, True]), + year=2024, + ) + + +def test_validate_person_poverty_rate_accepts_reasonable_rate(): + from policyengine_us_data.datasets.cps.enhanced_cps import ( + validate_person_poverty_rate, + ) + + class FakePoverty: + def mean(self): + return 0.12 + + class FakeSimulation: + def calc(self, variable, period, map_to): + assert variable == "person_in_poverty" + assert period == 2024 + assert map_to == "person" + return FakePoverty() + + assert validate_person_poverty_rate(FakeSimulation(), year=2024) == 0.12 + + +def test_validate_person_poverty_rate_rejects_implausible_rate(): + from policyengine_us_data.datasets.cps.enhanced_cps import ( + validate_person_poverty_rate, + ) + + class FakePoverty: + def mean(self): + return 0.39 + + class FakeSimulation: + def calc(self, variable, period, map_to): + assert variable == "person_in_poverty" + assert period == 2024 + assert map_to == "person" + return FakePoverty() + + with pytest.raises(ValueError, match="person poverty rate"): + validate_person_poverty_rate(FakeSimulation(), year=2024) From 70541bc6f1c4d6abd3efd8da07ae479e66142b92 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Thu, 28 May 2026 22:29:12 -0400 Subject: [PATCH 3/4] Refresh policyengine-us dependency --- policyengine_us_data/calibration/puf_impute.py | 16 +++++----------- policyengine_us_data/utils/loss.py | 12 ++++++------ pyproject.toml | 2 +- .../calibration/test_calibration_puf_impute.py | 4 +--- uv.lock | 8 ++++---- 5 files changed, 17 insertions(+), 25 deletions(-) diff --git a/policyengine_us_data/calibration/puf_impute.py b/policyengine_us_data/calibration/puf_impute.py index f8c500e68..e9ded16c5 100644 --- a/policyengine_us_data/calibration/puf_impute.py +++ b/policyengine_us_data/calibration/puf_impute.py @@ -994,17 +994,12 @@ def _run_qrf_imputation( else PUF_METADATA_MISSING_TOP_TAIL_VARIABLES ) financial_columns = [ - column - for column in candidate_columns - if column in frame.columns + column for column in candidate_columns if column in frame.columns ] if financial_columns: - forbes_person_mask |= ( - low_weight_mask - & ( - frame[financial_columns].abs().max(axis=1).to_numpy() - >= top_tail_threshold - ) + forbes_person_mask |= low_weight_mask & ( + frame[financial_columns].abs().max(axis=1).to_numpy() + >= top_tail_threshold ) if len(forbes_person_mask) == len(puf_agi) and forbes_person_mask.any(): if len(X_train_full) != len(forbes_person_mask) or len(X_train_override) != len( @@ -1016,8 +1011,7 @@ def _run_qrf_imputation( ) else: logger.info( - "Excluding %d %s person records from PUF QRF training " - "at threshold $%s", + "Excluding %d %s person records from PUF QRF training at threshold $%s", int(forbes_person_mask.sum()), top_tail_label, f"{top_tail_threshold:,.0f}", diff --git a/policyengine_us_data/utils/loss.py b/policyengine_us_data/utils/loss.py index 9fe2116a9..e090eb5b8 100644 --- a/policyengine_us_data/utils/loss.py +++ b/policyengine_us_data/utils/loss.py @@ -1259,13 +1259,13 @@ def _add_household_count_target(loss_matrix, targets_list, sim, dataset, time_pe puf_clone_target = target * PUF_CLONE_HOUSEHOLD_COUNT_TARGET_SHARE cps_target = target - puf_clone_target - loss_matrix[CPS_HOUSEHOLD_COUNT_TARGET] = ( - ~household_is_puf_clone - ).astype(np.float32) + loss_matrix[CPS_HOUSEHOLD_COUNT_TARGET] = (~household_is_puf_clone).astype( + np.float32 + ) targets_list.append(cps_target) - loss_matrix[PUF_CLONE_HOUSEHOLD_COUNT_TARGET] = ( - household_is_puf_clone - ).astype(np.float32) + loss_matrix[PUF_CLONE_HOUSEHOLD_COUNT_TARGET] = (household_is_puf_clone).astype( + np.float32 + ) targets_list.append(puf_clone_target) return targets_list, loss_matrix diff --git a/pyproject.toml b/pyproject.toml index 4e3259399..2c78fe309 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,7 +22,7 @@ classifiers = [ "Programming Language :: Python :: 3.14", ] dependencies = [ - "policyengine-us==1.712.0", + "policyengine-us==1.715.1", # policyengine-core 3.26.1 is the current 3.26.x runtime and includes the fix for # PolicyEngine/policyengine-core#482 (user-set ETERNITY inputs lost # after _invalidate_all_caches) and is required by policyengine-us 1.682.1+. diff --git a/tests/unit/calibration/test_calibration_puf_impute.py b/tests/unit/calibration/test_calibration_puf_impute.py index 485f6eba1..627529804 100644 --- a/tests/unit/calibration/test_calibration_puf_impute.py +++ b/tests/unit/calibration/test_calibration_puf_impute.py @@ -642,9 +642,7 @@ def fake_sequential_qrf(X_train, X_test, predictors, output_vars): assert all(len(frame) == 3 for frame in train_frames) assert all(99.0 not in set(frame["age"]) for frame in train_frames) - def test_qrf_training_keeps_non_forbes_top_tail_with_metadata( - self, monkeypatch - ): + def test_qrf_training_keeps_non_forbes_top_tail_with_metadata(self, monkeypatch): class FakeDataset: def load_dataset(self): return { diff --git a/uv.lock b/uv.lock index 891d69404..5f72f33f5 100644 --- a/uv.lock +++ b/uv.lock @@ -2164,7 +2164,7 @@ wheels = [ [[package]] name = "policyengine-us" -version = "1.712.0" +version = "1.715.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "microdf-python" }, @@ -2174,9 +2174,9 @@ dependencies = [ { name = "tables" }, { name = "tqdm" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/94/de/1c6ed33b769f12a29ee148eaf73399fa11b48d726f8920b513657c4ef2f5/policyengine_us-1.712.0.tar.gz", hash = "sha256:821f9d25fa1893d1e95b090868983f56281a448eec82bb7b11ec5d08814c8e39", size = 9957228, upload-time = "2026-05-27T22:52:32.555Z" } +sdist = { url = "https://files.pythonhosted.org/packages/62/92/2df510ffc7b6c7302bf85ac9a48c126734db7c329197be6eb516d1037210/policyengine_us-1.715.1.tar.gz", hash = "sha256:88222f88cbbaf24f30bc226fd74ec25429e4acaaefb2c0372a70d9763566d8e5", size = 10007007, upload-time = "2026-05-28T16:31:00.199Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/03/b0/0073cf07946c52ab7c7098e4570362bfa4c285070014a530aee02bfd452d/policyengine_us-1.712.0-py3-none-any.whl", hash = "sha256:618450b2eca7b15ff73530d0f67fa4d1064104c46d155fc3ecc7f40b1c178956", size = 10887739, upload-time = "2026-05-27T22:52:27.786Z" }, + { url = "https://files.pythonhosted.org/packages/af/6c/d1fcc845d3df1718c4c09201c84cb323de3a64436dd16485026b421733a7/policyengine_us-1.715.1-py3-none-any.whl", hash = "sha256:c3a3bfea48ca90e46ef2aae30daadd6e7b587cbea565b2e005516acac6678606", size = 11018778, upload-time = "2026-05-28T16:30:56.95Z" }, ] [[package]] @@ -2246,7 +2246,7 @@ requires-dist = [ { name = "pandas", specifier = ">=2.3.1" }, { name = "pip-system-certs", specifier = ">=3.0" }, { name = "policyengine-core", specifier = ">=3.26.1,<3.27" }, - { name = "policyengine-us", specifier = "==1.712.0" }, + { name = "policyengine-us", specifier = "==1.715.1" }, { name = "requests", specifier = ">=2.25.0" }, { name = "scipy", specifier = ">=1.15.3" }, { name = "setuptools", specifier = ">=60" }, From 192edf1e58dd139170e6cdbd90b178b7155bf4ff Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Fri, 29 May 2026 08:53:42 -0400 Subject: [PATCH 4/4] Fix ECPS calibration guard edge cases --- .../calibration/puf_impute.py | 21 +++---- .../datasets/cps/enhanced_cps.py | 2 +- policyengine_us_data/utils/loss.py | 6 +- .../test_calibration_puf_impute.py | 61 ++++++++++++++++++- tests/unit/calibration/test_loss_targets.py | 25 ++++++++ .../datasets/test_enhanced_cps_seeding.py | 4 +- 6 files changed, 102 insertions(+), 17 deletions(-) diff --git a/policyengine_us_data/calibration/puf_impute.py b/policyengine_us_data/calibration/puf_impute.py index e9ded16c5..de7498178 100644 --- a/policyengine_us_data/calibration/puf_impute.py +++ b/policyengine_us_data/calibration/puf_impute.py @@ -42,10 +42,10 @@ PUF_TOP_PERCENTILE = 99.5 FORBES_SYNTHETIC_FINANCIAL_THRESHOLD = 250_000_000 PUF_METADATA_MISSING_TOP_TAIL_THRESHOLD = 10_000_000 -FORBES_METADATA_VARIABLES = ( - "forbes_unit_id", - "forbes_replicate_id", - "forbes_rank", +FORBES_METADATA_MARKER_THRESHOLDS = ( + ("forbes_unit_id", 0), + ("forbes_replicate_id", 0), + ("forbes_rank", 1), ) PUF_METADATA_MISSING_TOP_TAIL_VARIABLES = ( "adjusted_gross_income", @@ -1085,9 +1085,12 @@ def _has_forbes_metadata( """Return whether usable Forbes synthetic-record metadata is present.""" if expected_length <= 0: return False - for variable in FORBES_METADATA_VARIABLES: + for variable, marker_threshold in FORBES_METADATA_MARKER_THRESHOLDS: values = _period_array(puf_data, variable, time_period) - if values is not None and len(values) == expected_length: + if values is None or len(values) != expected_length: + continue + values = np.asarray(values, dtype=float) + if np.any(values >= marker_threshold): return True return False @@ -1106,11 +1109,7 @@ def _forbes_person_training_mask( return np.zeros(n_persons, dtype=bool) tax_unit_forbes = np.zeros(len(tax_unit_id), dtype=bool) - for variable, default_threshold in ( - ("forbes_unit_id", 0), - ("forbes_replicate_id", 0), - ("forbes_rank", 1), - ): + for variable, default_threshold in FORBES_METADATA_MARKER_THRESHOLDS: values = _period_array(puf_data, variable, time_period) if values is None or len(values) != len(tax_unit_id): continue diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index ded79eca0..1d1a852ca 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -189,7 +189,7 @@ def validate_person_poverty_rate( """Fail fast when calibrated weights imply an implausible poverty rate.""" poverty_rate = float( - sim.calc("person_in_poverty", period=year, map_to="person").mean() + sim.calculate("person_in_poverty", period=year, map_to="person").mean() ) if not np.isfinite(poverty_rate): raise ValueError(f"Year {year}: person poverty rate is not finite") diff --git a/policyengine_us_data/utils/loss.py b/policyengine_us_data/utils/loss.py index e090eb5b8..b143824c0 100644 --- a/policyengine_us_data/utils/loss.py +++ b/policyengine_us_data/utils/loss.py @@ -4,6 +4,7 @@ import numpy as np import logging import sqlite3 +from pathlib import Path from policyengine_us_data.storage import CALIBRATION_FOLDER, STORAGE_FOLDER from policyengine_us_data.storage.calibration_targets.pull_soi_targets import ( @@ -1206,7 +1207,10 @@ def _add_transfer_balance_targets(loss_matrix, targets_list, sim, time_period): def _load_household_is_puf_clone(dataset, time_period): file_path = getattr(dataset, "file_path", None) - if file_path is None or not file_path.exists(): + if file_path is None: + return None + file_path = Path(file_path) + if not file_path.exists(): return None import h5py diff --git a/tests/unit/calibration/test_calibration_puf_impute.py b/tests/unit/calibration/test_calibration_puf_impute.py index 627529804..de6fa4c90 100644 --- a/tests/unit/calibration/test_calibration_puf_impute.py +++ b/tests/unit/calibration/test_calibration_puf_impute.py @@ -642,13 +642,70 @@ def fake_sequential_qrf(X_train, X_test, predictors, output_vars): assert all(len(frame) == 3 for frame in train_frames) assert all(99.0 not in set(frame["age"]) for frame in train_frames) - def test_qrf_training_keeps_non_forbes_top_tail_with_metadata(self, monkeypatch): + def test_qrf_training_filters_all_default_metadata_top_tail(self, monkeypatch): class FakeDataset: def load_dataset(self): return { "tax_unit_id": {2024: np.array([10, 20, 30])}, "person_tax_unit_id": {2024: np.array([10, 20, 30, 30])}, "forbes_unit_id": {2024: np.array([-1, -1, -1])}, + "forbes_rank": {2024: np.array([0, 0, 0])}, + "forbes_replicate_id": {2024: np.array([-1, -1, -1])}, + } + + class FakeMicrosimulation: + def __init__(self, dataset): + self.dataset = FakeDataset() + + def calculate(self, variable, map_to=None): + assert map_to == "person" + assert variable == "adjusted_gross_income" + return pd.Series([10.0, 30_000_000.0, 20.0, 30.0]) + + def calculate_dataframe(self, columns): + frame = pd.DataFrame({"age": [40.0, 99.0, 50.0, 55.0]}) + for column in columns: + if column not in frame: + frame[column] = 0.0 + return frame[list(columns)] + + train_frames = [] + + def fake_sequential_qrf(X_train, X_test, predictors, output_vars): + train_frames.append(X_train.copy()) + return {variable: np.zeros(len(X_test)) for variable in output_vars} + + monkeypatch.setattr("policyengine_us.Microsimulation", FakeMicrosimulation) + monkeypatch.setattr( + puf_impute_module, + "_sequential_qrf", + fake_sequential_qrf, + ) + + data = { + predictor: {2024: np.array([0.0, 1.0])} + for predictor in DEMOGRAPHIC_PREDICTORS + } + _run_qrf_imputation( + data=data, + time_period=2024, + puf_dataset=object(), + dataset_path=None, + ) + + assert len(train_frames) == 2 + assert all(len(frame) == 3 for frame in train_frames) + assert all(99.0 not in set(frame["age"]) for frame in train_frames) + + def test_qrf_training_keeps_non_forbes_top_tail_with_metadata(self, monkeypatch): + class FakeDataset: + def load_dataset(self): + return { + "tax_unit_id": {2024: np.array([10, 20, 30])}, + "person_tax_unit_id": {2024: np.array([10, 20, 30, 30])}, + "forbes_unit_id": {2024: np.array([-1, -1, 0])}, + "forbes_rank": {2024: np.array([0, 0, 1])}, + "forbes_replicate_id": {2024: np.array([-1, -1, 0])}, } class FakeMicrosimulation: @@ -702,7 +759,7 @@ def fake_sequential_qrf(X_train, X_test, predictors, output_vars): ) assert len(train_frames) == 2 - assert all(len(frame) == 4 for frame in train_frames) + assert all(len(frame) == 2 for frame in train_frames) assert all(99.0 in set(frame["age"]) for frame in train_frames) diff --git a/tests/unit/calibration/test_loss_targets.py b/tests/unit/calibration/test_loss_targets.py index 544bb9715..9ec8032b9 100644 --- a/tests/unit/calibration/test_loss_targets.py +++ b/tests/unit/calibration/test_loss_targets.py @@ -509,6 +509,31 @@ def test_add_household_count_target_adds_clone_split_targets(tmp_path): ) +def test_add_household_count_target_accepts_string_dataset_path(tmp_path): + file_path = tmp_path / "extended_cps_2024.h5" + with h5py.File(file_path, "w") as h5_file: + group = h5_file.create_group("household_is_puf_clone") + group.create_dataset("2024", data=np.array([0, 1], dtype=np.int8)) + + targets, loss_matrix = _add_household_count_target( + pd.DataFrame(index=[101, 102]), + [], + _FakeHouseholdWeightSimulation([80.0, 20.0]), + SimpleNamespace(file_path=str(file_path)), + 2024, + ) + + assert targets == [100.0, 50.0, 50.0] + np.testing.assert_array_equal( + loss_matrix[CPS_HOUSEHOLD_COUNT_TARGET].to_numpy(), + np.array([1.0, 0.0], dtype=np.float32), + ) + np.testing.assert_array_equal( + loss_matrix[PUF_CLONE_HOUSEHOLD_COUNT_TARGET].to_numpy(), + np.array([0.0, 1.0], dtype=np.float32), + ) + + def test_build_loss_matrix_adds_household_count_target_before_reweighting(): source = inspect.getsource(build_loss_matrix) diff --git a/tests/unit/datasets/test_enhanced_cps_seeding.py b/tests/unit/datasets/test_enhanced_cps_seeding.py index e81f44ab9..de875ad3e 100644 --- a/tests/unit/datasets/test_enhanced_cps_seeding.py +++ b/tests/unit/datasets/test_enhanced_cps_seeding.py @@ -123,7 +123,7 @@ def mean(self): return 0.12 class FakeSimulation: - def calc(self, variable, period, map_to): + def calculate(self, variable, period, map_to): assert variable == "person_in_poverty" assert period == 2024 assert map_to == "person" @@ -142,7 +142,7 @@ def mean(self): return 0.39 class FakeSimulation: - def calc(self, variable, period, map_to): + def calculate(self, variable, period, map_to): assert variable == "person_in_poverty" assert period == 2024 assert map_to == "person"