From 7bd461ef8729139d009fa947e5f049d2572376ed Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Sun, 24 May 2026 13:00:07 -0400 Subject: [PATCH] Mask missing LA calibration targets --- .../local_areas/local_authorities/loss.py | 33 ++-- .../tests/test_la_loss_council_tax.py | 6 +- .../tests/test_la_loss_missing_sources.py | 182 ++++++++++++++++++ 3 files changed, 198 insertions(+), 23 deletions(-) create mode 100644 policyengine_uk_data/tests/test_la_loss_missing_sources.py diff --git a/policyengine_uk_data/datasets/local_areas/local_authorities/loss.py b/policyengine_uk_data/datasets/local_areas/local_authorities/loss.py index 0993c53c5..d2455e22f 100644 --- a/policyengine_uk_data/datasets/local_areas/local_authorities/loss.py +++ b/policyengine_uk_data/datasets/local_areas/local_authorities/loss.py @@ -14,6 +14,12 @@ - Council tax bands A-H: VOA Council Tax Stock of Properties (per LA) - Council tax £ paid (net of CTR): MHCLG taxbase × Band D (England), Welsh Government Council Tax Income (Wales) + +Missing-source policy: local target cells stay NaN when no direct LA +source is available. The local-area calibrator masks those cells out of +the local loss. National targets are supplied by a separate national +target matrix, so this module should not fabricate local targets by +allocating national totals across missing-source LAs. """ from policyengine_uk import Microsimulation @@ -55,7 +61,6 @@ def create_local_authority_target_matrix( sim = Microsimulation(dataset=dataset, reform=reform) sim.default_calculation_period = time_period - original_weights = sim.calculate("household_weight", time_period).values matrix = pd.DataFrame() y = pd.DataFrame() @@ -154,31 +159,20 @@ def create_local_authority_target_matrix( has_ons_data = ( ons_merged["net_income_bhc"].notna() & ons_merged["households"].notna() ).values - total_households = ons_merged["households"].sum() - la_household_share = np.where( - ons_merged["households"].notna(), - ons_merged["households"].values / total_households, - 1 / len(la_codes), - ) - - national_bhc = (original_weights * hbai_net_income).sum() - national_ahc = (original_weights * hbai_net_income_ahc).sum() - national_hc = (original_weights * housing_costs).sum() - y["ons/equiv_net_income_bhc"] = np.where( has_ons_data, ons_merged["equiv_net_income_bhc_target"].values, - national_bhc * la_household_share, + np.nan, ) y["ons/equiv_net_income_ahc"] = np.where( has_ons_data, ons_merged["equiv_net_income_ahc_target"].values, - national_ahc * la_household_share, + np.nan, ) y["ons/equiv_housing_costs"] = np.where( has_ons_data, ons_merged["equiv_housing_costs_target"].values, - national_hc * la_household_share, + np.nan, ) # ── Tenure targets ───────────────────────────────────────────── @@ -216,9 +210,10 @@ def create_local_authority_target_matrix( ("social_rent", "social_rent_pct"), ]: targets = tenure_merged[pct_col] / 100 * tenure_merged["households"] - national = (original_weights * matrix[f"tenure/{tenure_key}"].values).sum() y[f"tenure/{tenure_key}"] = np.where( - has_tenure, targets.values, national * la_household_share + has_tenure, + targets.values, + np.nan, ) # ── Private rent amounts ─────────────────────────────────────── @@ -247,12 +242,10 @@ def create_local_authority_target_matrix( & tenure_merged["private_rent_pct"].notna() & tenure_merged["households"].notna() ).values - national_rent = (original_weights * private_rent_amount).sum() - y["rent/private_rent"] = np.where( has_rent, tenure_merged["private_rent_target"].values, - national_rent * la_household_share, + np.nan, ) # ── Council tax band counts (LA targets) ─────────────────────── diff --git a/policyengine_uk_data/tests/test_la_loss_council_tax.py b/policyengine_uk_data/tests/test_la_loss_council_tax.py index c33949e06..c36574547 100644 --- a/policyengine_uk_data/tests/test_la_loss_council_tax.py +++ b/policyengine_uk_data/tests/test_la_loss_council_tax.py @@ -49,8 +49,8 @@ def test_band_count_columns_exist_for_every_wired_band(): def test_england_and_wales_have_band_a_to_h_populated(): """E/W rows should have non-null counts for A-H. If the CSV regresses - to NaN there, the loss matrix will silently fall back to the - national-share estimate and the calibrator loses its real signal.""" + to NaN there, the loss matrix will mask the cell and the calibrator + loses its real signal.""" ew = CT_DATA[CT_DATA["country"].isin(["ENGLAND", "WALES"])] for band in WIRED_BANDS: non_null = ew[f"count_band_{band}"].notna().sum() @@ -62,7 +62,7 @@ def test_england_and_wales_have_band_a_to_h_populated(): def test_scotland_band_counts_are_null_as_documented(): """Scotland VOA band counts are absent — they should consistently be - NaN so the loss matrix routes them through the fallback.""" + NaN so the loss matrix masks them.""" scotland = CT_DATA[CT_DATA["country"] == "SCOTLAND"] for band in WIRED_BANDS: assert scotland[f"count_band_{band}"].isna().all(), ( diff --git a/policyengine_uk_data/tests/test_la_loss_missing_sources.py b/policyengine_uk_data/tests/test_la_loss_missing_sources.py new file mode 100644 index 000000000..11f078fe2 --- /dev/null +++ b/policyengine_uk_data/tests/test_la_loss_missing_sources.py @@ -0,0 +1,182 @@ +import numpy as np +import pandas as pd + + +class _FakeDataset: + time_period = 2025 + + +class _FakeSim: + def __init__(self, *args, **kwargs): + self.default_calculation_period = 2025 + + def calculate(self, variable, *args, **kwargs): + values = { + "employment_income": np.array([10_000.0, 30_000.0]), + "income_tax": np.array([1.0, 1.0]), + "age": np.array([40, 70]), + "universal_credit": np.array([0.0, 1.0]), + "equiv_hbai_household_net_income": np.array([20_000.0, 25_000.0]), + "equiv_hbai_household_net_income_ahc": np.array([18_000.0, 22_000.0]), + "tenure_type": np.array(["RENT_PRIVATELY", "OWNED_OUTRIGHT"]), + "benunit_rent": np.array([12_000.0, 0.0]), + "country": np.array(["ENGLAND", "SCOTLAND"]), + } + return type("Result", (), {"values": values[variable]})() + + def map_result(self, values, source_entity, target_entity): + return np.asarray(values) + + +def _fake_la_codes(): + return pd.DataFrame( + { + "code": ["E06000001", "W06000001", "S12000001", "N09000001"], + } + ) + + +def _patch_common_la_inputs(monkeypatch, tmp_path): + from policyengine_uk_data.datasets.local_areas.local_authorities import loss + + (_storage := tmp_path / "storage").mkdir() + _fake_la_codes().to_csv(_storage / "local_authorities_2021.csv", index=False) + + monkeypatch.setattr(loss, "STORAGE_FOLDER", _storage) + monkeypatch.setattr(loss, "Microsimulation", _FakeSim) + monkeypatch.setattr(loss, "INCOME_VARIABLES", ["employment_income"]) + monkeypatch.setattr( + loss, + "get_la_income_targets", + lambda: pd.DataFrame( + { + "employment_income_amount": [1.0, 1.0, 1.0, 1.0], + "employment_income_count": [1.0, 1.0, 1.0, 1.0], + } + ), + ) + monkeypatch.setattr( + loss, + "get_national_income_projections", + lambda year: pd.DataFrame( + { + "total_income_lower_bound": [12_570], + "total_income_upper_bound": [np.inf], + "employment_income_amount": [4.0], + } + ), + ) + monkeypatch.setattr( + loss, + "get_la_age_targets", + lambda: pd.DataFrame({"age/0_100": [1.0, 1.0, 1.0, 1.0]}), + ) + monkeypatch.setattr(loss, "get_uk_total_population", lambda year: 4.0) + monkeypatch.setattr(loss, "get_la_uc_targets", lambda: pd.Series([0, 1, 0, 0])) + monkeypatch.setattr( + loss, + "get_ons_income_uprating_factors", + lambda year: (1.0, 1.0), + ) + monkeypatch.setattr( + loss, + "load_household_counts", + lambda: pd.DataFrame( + { + "la_code": ["E06000001", "W06000001"], + "households": [100.0, 200.0], + } + ), + ) + return loss + + +def test_la_loss_masks_missing_ons_income_cells(monkeypatch, tmp_path): + loss = _patch_common_la_inputs(monkeypatch, tmp_path) + monkeypatch.setattr( + loss, + "load_ons_la_income", + lambda: pd.DataFrame( + { + "la_code": ["E06000001", "W06000001"], + "net_income_bhc": [30_000.0, 25_000.0], + "net_income_ahc": [26_000.0, 21_000.0], + } + ), + ) + monkeypatch.setattr( + loss, + "load_tenure_data", + lambda: pd.DataFrame( + { + "la_code": ["E06000001"], + "owned_outright_pct": [30.0], + "owned_mortgage_pct": [30.0], + "private_rent_pct": [25.0], + "social_rent_pct": [15.0], + } + ), + ) + monkeypatch.setattr( + loss, + "load_private_rents", + lambda: pd.DataFrame( + {"area_code": ["E06000001"], "median_annual_rent": [12_000.0]} + ), + ) + + _, y, _ = loss.create_local_authority_target_matrix(_FakeDataset()) + + direct = y["ons/equiv_net_income_bhc"].iloc[:2] + missing = y["ons/equiv_net_income_bhc"].iloc[2:] + assert direct.notna().all() + assert missing.isna().all() + + +def test_la_loss_masks_missing_tenure_and_rent_cells(monkeypatch, tmp_path): + loss = _patch_common_la_inputs(monkeypatch, tmp_path) + monkeypatch.setattr( + loss, + "load_ons_la_income", + lambda: pd.DataFrame( + { + "la_code": ["E06000001", "W06000001"], + "net_income_bhc": [30_000.0, 25_000.0], + "net_income_ahc": [26_000.0, 21_000.0], + } + ), + ) + monkeypatch.setattr( + loss, + "load_tenure_data", + lambda: pd.DataFrame( + { + "la_code": ["E06000001"], + "owned_outright_pct": [30.0], + "owned_mortgage_pct": [30.0], + "private_rent_pct": [25.0], + "social_rent_pct": [15.0], + } + ), + ) + monkeypatch.setattr( + loss, + "load_private_rents", + lambda: pd.DataFrame( + {"area_code": ["E06000001"], "median_annual_rent": [12_000.0]} + ), + ) + + _, y, _ = loss.create_local_authority_target_matrix(_FakeDataset()) + + for column in [ + "tenure/owned_outright", + "tenure/owned_mortgage", + "tenure/private_rent", + "tenure/social_rent", + "rent/private_rent", + ]: + assert pd.notna(y[column].iloc[0]), f"{column}: direct cell should be finite" + assert y[column].iloc[1:].isna().all(), ( + f"{column}: missing-source cells should be masked" + )