From 7bd461ef8729139d009fa947e5f049d2572376ed Mon Sep 17 00:00:00 2001
From: Max Ghenis <mghenis@gmail.com>
Date: Sun, 24 May 2026 13:00:07 -0400
Subject: [PATCH] Mask missing LA calibration targets

---
 .../local_areas/local_authorities/loss.py     |  33 ++--
 .../tests/test_la_loss_council_tax.py         |   6 +-
 .../tests/test_la_loss_missing_sources.py     | 182 ++++++++++++++++++
 3 files changed, 198 insertions(+), 23 deletions(-)
 create mode 100644 policyengine_uk_data/tests/test_la_loss_missing_sources.py

diff --git a/policyengine_uk_data/datasets/local_areas/local_authorities/loss.py b/policyengine_uk_data/datasets/local_areas/local_authorities/loss.py
index 0993c53c5..d2455e22f 100644
--- a/policyengine_uk_data/datasets/local_areas/local_authorities/loss.py
+++ b/policyengine_uk_data/datasets/local_areas/local_authorities/loss.py
@@ -14,6 +14,12 @@
 - Council tax bands A-H: VOA Council Tax Stock of Properties (per LA)
 - Council tax £ paid (net of CTR): MHCLG taxbase × Band D (England),
   Welsh Government Council Tax Income (Wales)
+
+Missing-source policy: local target cells stay NaN when no direct LA
+source is available. The local-area calibrator masks those cells out of
+the local loss. National targets are supplied by a separate national
+target matrix, so this module should not fabricate local targets by
+allocating national totals across missing-source LAs.
 """
 
 from policyengine_uk import Microsimulation
@@ -55,7 +61,6 @@ def create_local_authority_target_matrix(
 
     sim = Microsimulation(dataset=dataset, reform=reform)
     sim.default_calculation_period = time_period
-    original_weights = sim.calculate("household_weight", time_period).values
 
     matrix = pd.DataFrame()
     y = pd.DataFrame()
@@ -154,31 +159,20 @@ def create_local_authority_target_matrix(
     has_ons_data = (
         ons_merged["net_income_bhc"].notna() & ons_merged["households"].notna()
     ).values
-    total_households = ons_merged["households"].sum()
-    la_household_share = np.where(
-        ons_merged["households"].notna(),
-        ons_merged["households"].values / total_households,
-        1 / len(la_codes),
-    )
-
-    national_bhc = (original_weights * hbai_net_income).sum()
-    national_ahc = (original_weights * hbai_net_income_ahc).sum()
-    national_hc = (original_weights * housing_costs).sum()
-
     y["ons/equiv_net_income_bhc"] = np.where(
         has_ons_data,
         ons_merged["equiv_net_income_bhc_target"].values,
-        national_bhc * la_household_share,
+        np.nan,
     )
     y["ons/equiv_net_income_ahc"] = np.where(
         has_ons_data,
         ons_merged["equiv_net_income_ahc_target"].values,
-        national_ahc * la_household_share,
+        np.nan,
     )
     y["ons/equiv_housing_costs"] = np.where(
         has_ons_data,
         ons_merged["equiv_housing_costs_target"].values,
-        national_hc * la_household_share,
+        np.nan,
     )
 
     # ── Tenure targets ─────────────────────────────────────────────
@@ -216,9 +210,10 @@ def create_local_authority_target_matrix(
         ("social_rent", "social_rent_pct"),
     ]:
         targets = tenure_merged[pct_col] / 100 * tenure_merged["households"]
-        national = (original_weights * matrix[f"tenure/{tenure_key}"].values).sum()
         y[f"tenure/{tenure_key}"] = np.where(
-            has_tenure, targets.values, national * la_household_share
+            has_tenure,
+            targets.values,
+            np.nan,
         )
 
     # ── Private rent amounts ───────────────────────────────────────
@@ -247,12 +242,10 @@ def create_local_authority_target_matrix(
         & tenure_merged["private_rent_pct"].notna()
         & tenure_merged["households"].notna()
     ).values
-    national_rent = (original_weights * private_rent_amount).sum()
-
     y["rent/private_rent"] = np.where(
         has_rent,
         tenure_merged["private_rent_target"].values,
-        national_rent * la_household_share,
+        np.nan,
     )
 
     # ── Council tax band counts (LA targets) ───────────────────────
diff --git a/policyengine_uk_data/tests/test_la_loss_council_tax.py b/policyengine_uk_data/tests/test_la_loss_council_tax.py
index c33949e06..c36574547 100644
--- a/policyengine_uk_data/tests/test_la_loss_council_tax.py
+++ b/policyengine_uk_data/tests/test_la_loss_council_tax.py
@@ -49,8 +49,8 @@ def test_band_count_columns_exist_for_every_wired_band():
 
 def test_england_and_wales_have_band_a_to_h_populated():
     """E/W rows should have non-null counts for A-H. If the CSV regresses
-    to NaN there, the loss matrix will silently fall back to the
-    national-share estimate and the calibrator loses its real signal."""
+    to NaN there, the loss matrix will mask the cell and the calibrator
+    loses its real signal."""
     ew = CT_DATA[CT_DATA["country"].isin(["ENGLAND", "WALES"])]
     for band in WIRED_BANDS:
         non_null = ew[f"count_band_{band}"].notna().sum()
@@ -62,7 +62,7 @@ def test_england_and_wales_have_band_a_to_h_populated():
 
 def test_scotland_band_counts_are_null_as_documented():
     """Scotland VOA band counts are absent — they should consistently be
-    NaN so the loss matrix routes them through the fallback."""
+    NaN so the loss matrix masks them."""
     scotland = CT_DATA[CT_DATA["country"] == "SCOTLAND"]
     for band in WIRED_BANDS:
         assert scotland[f"count_band_{band}"].isna().all(), (
diff --git a/policyengine_uk_data/tests/test_la_loss_missing_sources.py b/policyengine_uk_data/tests/test_la_loss_missing_sources.py
new file mode 100644
index 000000000..11f078fe2
--- /dev/null
+++ b/policyengine_uk_data/tests/test_la_loss_missing_sources.py
@@ -0,0 +1,188 @@
+"""Tests that LA target cells with no direct source are left as NaN.
+
+The loaders and ``Microsimulation`` are monkeypatched on the ``loss``
+module, so each test controls which LAs have a source for the targets
+it checks.
+"""
+
+import numpy as np
+import pandas as pd
+
+
+class _FakeDataset:
+    """Dataset stand-in exposing only ``time_period``."""
+
+    time_period = 2025
+
+
+class _FakeSim:
+    """Replacement for ``Microsimulation`` returning canned two-element arrays."""
+
+    def __init__(self, *args, **kwargs):
+        self.default_calculation_period = 2025
+
+    def calculate(self, variable, *args, **kwargs):
+        """Return an object whose ``values`` is the canned array for ``variable``."""
+        values = {
+            "employment_income": np.array([10_000.0, 30_000.0]),
+            "income_tax": np.array([1.0, 1.0]),
+            "age": np.array([40, 70]),
+            "universal_credit": np.array([0.0, 1.0]),
+            "equiv_hbai_household_net_income": np.array([20_000.0, 25_000.0]),
+            "equiv_hbai_household_net_income_ahc": np.array([18_000.0, 22_000.0]),
+            "tenure_type": np.array(["RENT_PRIVATELY", "OWNED_OUTRIGHT"]),
+            "benunit_rent": np.array([12_000.0, 0.0]),
+            "country": np.array(["ENGLAND", "SCOTLAND"]),
+        }
+        return type("Result", (), {"values": values[variable]})()
+
+    def map_result(self, values, source_entity, target_entity):
+        """Return ``values`` as an array; no entity mapping is simulated."""
+        return np.asarray(values)
+
+
+def _fake_la_codes():
+    """Return the four-LA code table the tests write to storage."""
+    return pd.DataFrame(
+        {
+            "code": ["E06000001", "W06000001", "S12000001", "N09000001"],
+        }
+    )
+
+
+def _patch_common_la_inputs(monkeypatch, tmp_path):
+    """Stub the inputs shared by every test and return the ``loss`` module.
+
+    Household counts are supplied for the first two LAs only.
+    """
+    from policyengine_uk_data.datasets.local_areas.local_authorities import loss
+
+    storage = tmp_path / "storage"
+    storage.mkdir()
+    _fake_la_codes().to_csv(storage / "local_authorities_2021.csv", index=False)
+
+    monkeypatch.setattr(loss, "STORAGE_FOLDER", storage)
+    monkeypatch.setattr(loss, "Microsimulation", _FakeSim)
+    monkeypatch.setattr(loss, "INCOME_VARIABLES", ["employment_income"])
+    monkeypatch.setattr(
+        loss,
+        "get_la_income_targets",
+        lambda: pd.DataFrame(
+            {
+                "employment_income_amount": [1.0, 1.0, 1.0, 1.0],
+                "employment_income_count": [1.0, 1.0, 1.0, 1.0],
+            }
+        ),
+    )
+    monkeypatch.setattr(
+        loss,
+        "get_national_income_projections",
+        lambda year: pd.DataFrame(
+            {
+                "total_income_lower_bound": [12_570],
+                "total_income_upper_bound": [np.inf],
+                "employment_income_amount": [4.0],
+            }
+        ),
+    )
+    monkeypatch.setattr(
+        loss,
+        "get_la_age_targets",
+        lambda: pd.DataFrame({"age/0_100": [1.0, 1.0, 1.0, 1.0]}),
+    )
+    monkeypatch.setattr(loss, "get_uk_total_population", lambda year: 4.0)
+    monkeypatch.setattr(loss, "get_la_uc_targets", lambda: pd.Series([0, 1, 0, 0]))
+    monkeypatch.setattr(
+        loss,
+        "get_ons_income_uprating_factors",
+        lambda year: (1.0, 1.0),
+    )
+    monkeypatch.setattr(
+        loss,
+        "load_household_counts",
+        lambda: pd.DataFrame(
+            {
+                "la_code": ["E06000001", "W06000001"],
+                "households": [100.0, 200.0],
+            }
+        ),
+    )
+    return loss
+
+
+def _patch_partial_source_loaders(monkeypatch, loss):
+    """Stub the ONS income, tenure and private-rent loaders with gaps.
+
+    ONS income covers the first two LAs; tenure and private rents cover
+    only the first. The remaining LAs have no row in these sources.
+    """
+    monkeypatch.setattr(
+        loss,
+        "load_ons_la_income",
+        lambda: pd.DataFrame(
+            {
+                "la_code": ["E06000001", "W06000001"],
+                "net_income_bhc": [30_000.0, 25_000.0],
+                "net_income_ahc": [26_000.0, 21_000.0],
+            }
+        ),
+    )
+    monkeypatch.setattr(
+        loss,
+        "load_tenure_data",
+        lambda: pd.DataFrame(
+            {
+                "la_code": ["E06000001"],
+                "owned_outright_pct": [30.0],
+                "owned_mortgage_pct": [30.0],
+                "private_rent_pct": [25.0],
+                "social_rent_pct": [15.0],
+            }
+        ),
+    )
+    monkeypatch.setattr(
+        loss,
+        "load_private_rents",
+        lambda: pd.DataFrame(
+            {"area_code": ["E06000001"], "median_annual_rent": [12_000.0]}
+        ),
+    )
+
+
+def test_la_loss_masks_missing_ons_income_cells(monkeypatch, tmp_path):
+    """ONS-derived targets are NaN for LAs absent from the ONS source."""
+    loss = _patch_common_la_inputs(monkeypatch, tmp_path)
+    _patch_partial_source_loaders(monkeypatch, loss)
+
+    _, y, _ = loss.create_local_authority_target_matrix(_FakeDataset())
+
+    assert y["ons/equiv_net_income_bhc"].iloc[:2].notna().all()
+    # All three ONS columns are built with the same has_ons_data mask.
+    for column in [
+        "ons/equiv_net_income_bhc",
+        "ons/equiv_net_income_ahc",
+        "ons/equiv_housing_costs",
+    ]:
+        assert y[column].iloc[2:].isna().all(), (
+            f"{column}: missing-source cells should be masked"
+        )
+
+
+def test_la_loss_masks_missing_tenure_and_rent_cells(monkeypatch, tmp_path):
+    """Tenure and rent targets are NaN for LAs with no direct source row."""
+    loss = _patch_common_la_inputs(monkeypatch, tmp_path)
+    _patch_partial_source_loaders(monkeypatch, loss)
+
+    _, y, _ = loss.create_local_authority_target_matrix(_FakeDataset())
+
+    for column in [
+        "tenure/owned_outright",
+        "tenure/owned_mortgage",
+        "tenure/private_rent",
+        "tenure/social_rent",
+        "rent/private_rent",
+    ]:
+        assert pd.notna(y[column].iloc[0]), f"{column}: direct cell should be finite"
+        assert y[column].iloc[1:].isna().all(), (
+            f"{column}: missing-source cells should be masked"
+        )