From baa30dd84b6d6c49356e12d7cede29871aa5a239 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Sat, 23 May 2026 20:39:14 -0400 Subject: [PATCH 1/9] Update SPI ingestion to 2022-23 --- changelog.d/spi-2022-23.md | 1 + .../datasets/imputations/income.py | 64 ++++++++++--------- policyengine_uk_data/datasets/spi.py | 20 ++++-- .../storage/download_private_prerequisites.py | 3 +- .../tests/test_frs_prerequisites.py | 8 +++ policyengine_uk_data/tests/test_spi_build.py | 40 ++++++++++++ .../utils/incomes_projection.py | 23 ++++--- 7 files changed, 113 insertions(+), 46 deletions(-) create mode 100644 changelog.d/spi-2022-23.md diff --git a/changelog.d/spi-2022-23.md b/changelog.d/spi-2022-23.md new file mode 100644 index 000000000..d61f1e622 --- /dev/null +++ b/changelog.d/spi-2022-23.md @@ -0,0 +1 @@ +Update SPI private prerequisites and income imputation to the 2022-23 Public Use Tape. diff --git a/policyengine_uk_data/datasets/imputations/income.py b/policyengine_uk_data/datasets/imputations/income.py index 586fc0fd1..c6d537d41 100644 --- a/policyengine_uk_data/datasets/imputations/income.py +++ b/policyengine_uk_data/datasets/imputations/income.py @@ -11,10 +11,16 @@ from policyengine_uk_data.storage import STORAGE_FOLDER from policyengine_uk.data import UKSingleYearDataset from policyengine_uk import Microsimulation +from policyengine_uk_data.datasets.spi import ( + AGE_RANGES, + REGION_MAP, + SPI_RELEASE_NAME, + SPI_TAB_FILENAME, +) from policyengine_uk_data.utils.stack import stack_datasets from policyengine_uk_data.utils.subsample import subsample_dataset -SPI_TAB_FOLDER = STORAGE_FOLDER / "spi_2020_21" +SPI_TAB_FOLDER = STORAGE_FOLDER / SPI_RELEASE_NAME SPI_RENAMES = dict( private_pension_income="PENSION", self_employment_income="PROFITS", @@ -37,7 +43,18 @@ ) -def generate_spi_table(spi: pd.DataFrame): +def _spi_age_bounds(age_code) -> tuple[int, int]: + try: + return AGE_RANGES[int(age_code)] + except (TypeError, ValueError, KeyError): + return AGE_RANGES[-1] + + +def generate_spi_table( + spi: pd.DataFrame, + seed: int = 0, + sample_size: int | None = 100_000, +): """ Clean and transform SPI data for income imputation model training. @@ -47,29 +64,12 @@ def generate_spi_table(spi: pd.DataFrame): Returns: Cleaned DataFrame with age and region mappings applied. """ - LOWER = np.array([0, 16, 25, 35, 45, 55, 65, 75]) - UPPER = np.array([16, 25, 35, 45, 55, 65, 75, 80]) + rng = np.random.default_rng(seed) age_range = spi.AGERANGE - spi["age"] = LOWER[age_range] + np.random.rand(len(spi)) * ( - UPPER[age_range] - LOWER[age_range] - ) + bounds = np.array([_spi_age_bounds(age) for age in age_range]) + spi["age"] = bounds[:, 0] + rng.random(len(spi)) * (bounds[:, 1] - bounds[:, 0]) - REGIONS = { - 1: "NORTH_EAST", - 2: "NORTH_WEST", - 3: "YORKSHIRE", - 4: "EAST_MIDLANDS", - 5: "WEST_MIDLANDS", - 6: "EAST_OF_ENGLAND", - 7: "LONDON", - 8: "SOUTH_EAST", - 9: "SOUTH_WEST", - 10: "WALES", - 11: "SCOTLAND", - 12: "NORTHERN_IRELAND", - } - - spi["region"] = np.array([REGIONS.get(x, "LONDON") for x in spi.GORCODE]) + spi["region"] = spi.GORCODE.map(REGION_MAP).fillna("UNKNOWN") spi["gender"] = np.where(spi.SEX == 1, "MALE", "FEMALE") @@ -78,11 +78,17 @@ def generate_spi_table(spi: pd.DataFrame): spi["employment_income"] = spi[["PAY", "EPB", "TAXTERM"]].sum(axis=1) - spi = pd.concat( - [ - spi.sample(100_000, weights=spi.person_weight, replace=True), - ] - ) + if sample_size is not None: + spi = pd.concat( + [ + spi.sample( + sample_size, + weights=spi.person_weight, + replace=True, + random_state=seed, + ), + ] + ) return spi @@ -132,7 +138,7 @@ def save_imputation_models(): from policyengine_uk_data.utils import QRF income = QRF() - spi = pd.read_csv(SPI_TAB_FOLDER / "put2021uk.tab", delimiter="\t") + spi = pd.read_csv(SPI_TAB_FOLDER / SPI_TAB_FILENAME, delimiter="\t") spi = generate_spi_table(spi) spi = spi[PREDICTORS + IMPUTATIONS] income.fit(spi[PREDICTORS], spi[IMPUTATIONS]) diff --git a/policyengine_uk_data/datasets/spi.py b/policyengine_uk_data/datasets/spi.py index ed13929a4..e22be675c 100644 --- a/policyengine_uk_data/datasets/spi.py +++ b/policyengine_uk_data/datasets/spi.py @@ -3,6 +3,11 @@ import numpy as np from policyengine_uk.data import UKSingleYearDataset +SPI_RELEASE_NAME = "spi_2022_23" +SPI_TAB_FILENAME = "put2223uk.tab" +SPI_FISCAL_YEAR = 2022 +SPI_H5_FILENAME = "spi_2022_23.h5" + # Age-range midpoints for random age imputation. # Key -1 covers records with no reported AGERANGE — use a broad working-age @@ -86,8 +91,8 @@ def create_spi( """Build a :class:`UKSingleYearDataset` from an SPI microdata `.tab` file. Args: - spi_data_file_path: Path to the SPI `.tab` file (e.g. `put2021uk.tab`). - fiscal_year: UK fiscal year for the dataset (e.g. 2020 → 2020-21). + spi_data_file_path: Path to the SPI `.tab` file (e.g. `put2223uk.tab`). + fiscal_year: UK fiscal year for the dataset (e.g. 2022 → 2022-23). output_file_path: Unused here — callers may save the returned dataset themselves with ``dataset.save(path)``. Kept as a kwarg so existing call sites don't break. @@ -142,8 +147,9 @@ def create_spi( # generator so builds are reproducible (previously used the unseeded # global np.random.rand). percent_along_age_range = rng.random(len(df)) - min_age = np.array([AGE_RANGES[age][0] for age in age_range]) - max_age = np.array([AGE_RANGES[age][1] for age in age_range]) + bounds = np.array([AGE_RANGES.get(int(age), AGE_RANGES[-1]) for age in age_range]) + min_age = bounds[:, 0] + max_age = bounds[:, 1] person["age"] = (min_age + (max_age - min_age) * percent_along_age_range).astype( int ) @@ -174,8 +180,8 @@ def create_spi( if __name__ == "__main__": - spi_data_file_path = STORAGE_FOLDER / "spi_2020_21" / "put2021uk.tab" - fiscal_year = 2020 - output_file_path = STORAGE_FOLDER / "spi_2020.h5" + spi_data_file_path = STORAGE_FOLDER / SPI_RELEASE_NAME / SPI_TAB_FILENAME + fiscal_year = SPI_FISCAL_YEAR + output_file_path = STORAGE_FOLDER / SPI_H5_FILENAME spi = create_spi(spi_data_file_path, fiscal_year, output_file_path) spi.save(output_file_path) diff --git a/policyengine_uk_data/storage/download_private_prerequisites.py b/policyengine_uk_data/storage/download_private_prerequisites.py index 05e081192..815759a76 100644 --- a/policyengine_uk_data/storage/download_private_prerequisites.py +++ b/policyengine_uk_data/storage/download_private_prerequisites.py @@ -1,4 +1,5 @@ from policyengine_uk_data.datasets.frs_release import CURRENT_FRS_RELEASE +from policyengine_uk_data.datasets.spi import SPI_RELEASE_NAME from policyengine_uk_data.utils.hf_destinations import PRIVATE_REPO from policyengine_uk_data.utils.huggingface import download from pathlib import Path @@ -13,7 +14,7 @@ ("lcfs_2021_22.zip", None), ("was_2006_20.zip", None), ("etb_1977_21.zip", None), - ("spi_2020_21.zip", None), + (f"{SPI_RELEASE_NAME}.zip", None), ] diff --git a/policyengine_uk_data/tests/test_frs_prerequisites.py b/policyengine_uk_data/tests/test_frs_prerequisites.py index f4753577f..5c22c50b9 100644 --- a/policyengine_uk_data/tests/test_frs_prerequisites.py +++ b/policyengine_uk_data/tests/test_frs_prerequisites.py @@ -10,6 +10,7 @@ _needs_calibration_year_materialization, ) from policyengine_uk_data.datasets.frs_release import CURRENT_FRS_RELEASE +from policyengine_uk_data.datasets.spi import SPI_RELEASE_NAME from policyengine_uk_data.storage.download_private_prerequisites import ( PRIVATE_PREREQUISITES, extract_zipped_folder, @@ -23,6 +24,13 @@ def test_private_prerequisites_use_current_frs_release(): assert "frs_2023_24.zip" not in prerequisite_names +def test_private_prerequisites_use_current_spi_release(): + prerequisite_names = [filename for filename, _ in PRIVATE_PREREQUISITES] + + assert f"{SPI_RELEASE_NAME}.zip" in prerequisite_names + assert "spi_2020_21.zip" not in prerequisite_names + + def test_current_frs_release_uses_survey_year_as_base_year(): assert CURRENT_FRS_RELEASE.base_year == CURRENT_FRS_RELEASE.survey_year diff --git a/policyengine_uk_data/tests/test_spi_build.py b/policyengine_uk_data/tests/test_spi_build.py index 9ecd809df..36f177c31 100644 --- a/policyengine_uk_data/tests/test_spi_build.py +++ b/policyengine_uk_data/tests/test_spi_build.py @@ -30,6 +30,7 @@ SPI_COLUMNS = [ + "SEX", "SREF", "FACT", "DIVIDENDS", @@ -166,3 +167,42 @@ def test_create_spi_marriage_allowance_uses_fiscal_year_parameters(tmp_path): # either, but require it's NOT the stale 2020-21 £1,250 figure. assert marriage_2025[0] != 1_250 assert marriage_2025[0] >= 1_250 # PA has only risen since 2020 + + +def test_current_spi_release_metadata_points_to_2022_23(): + from policyengine_uk_data.datasets.spi import ( + SPI_FISCAL_YEAR, + SPI_H5_FILENAME, + SPI_RELEASE_NAME, + SPI_TAB_FILENAME, + ) + + assert SPI_RELEASE_NAME == "spi_2022_23" + assert SPI_TAB_FILENAME == "put2223uk.tab" + assert SPI_FISCAL_YEAR == 2022 + assert SPI_H5_FILENAME == "spi_2022_23.h5" + + +def test_income_spi_generation_handles_current_unknown_codes(): + from policyengine_uk_data.datasets.imputations.income import generate_spi_table + + data = {col: np.zeros(1, dtype=float) for col in SPI_COLUMNS} + data["SREF"] = [1] + data["FACT"] = [1] + data["SEX"] = [1] + data["GORCODE"] = [13] + data["AGERANGE"] = [-1] + spi = pd.DataFrame(data) + + out = generate_spi_table(spi, seed=0, sample_size=5) + + assert out["region"].tolist() == ["UNKNOWN"] * 5 + assert out["age"].between(16, 70, inclusive="left").all() + + +def test_income_projection_uses_current_spi_release(): + from policyengine_uk_data.datasets.spi import SPI_FISCAL_YEAR, SPI_H5_FILENAME + from policyengine_uk_data.utils import incomes_projection + + assert incomes_projection.SPI_DATASET.endswith(SPI_H5_FILENAME) + assert incomes_projection.SPI_FISCAL_YEAR == SPI_FISCAL_YEAR diff --git a/policyengine_uk_data/utils/incomes_projection.py b/policyengine_uk_data/utils/incomes_projection.py index 9edb92f12..2e33a230d 100644 --- a/policyengine_uk_data/utils/incomes_projection.py +++ b/policyengine_uk_data/utils/incomes_projection.py @@ -5,10 +5,12 @@ import warnings from policyengine_uk import Microsimulation from microcalibrate import Calibration -from policyengine_uk_data.datasets import SPI_2020_21 +from policyengine_uk_data.datasets.spi import SPI_FISCAL_YEAR, SPI_H5_FILENAME warnings.filterwarnings("ignore") +SPI_DATASET = str(STORAGE_FOLDER / SPI_H5_FILENAME) + tax_benefit = pd.read_csv(STORAGE_FOLDER / "tax_benefit.csv") tax_benefit["name"] = tax_benefit["name"].apply(lambda x: f"obr/{x}") demographics = pd.read_csv(STORAGE_FOLDER / "demographics.csv") @@ -78,10 +80,13 @@ def create_target_matrix( incomes = pd.read_csv(STORAGE_FOLDER / "incomes.csv") for variable in REWEIGHT_VARIABLES: incomes[variable + "_count"] = uprate_values( - incomes[variable + "_count"], "household_weight", 2021, time_period + incomes[variable + "_count"], + "household_weight", + SPI_FISCAL_YEAR, + time_period, ) incomes[variable + "_amount"] = uprate_values( - incomes[variable + "_amount"], variable, 2021, time_period + incomes[variable + "_amount"], variable, SPI_FISCAL_YEAR, time_period ) for i, row in incomes.iterrows(): @@ -143,10 +148,10 @@ def get_loss_results(dataset, time_period, reform=None): def create_income_projections(): - loss_matrix, targets_array = create_target_matrix(SPI_2020_21, 2022) + loss_matrix, targets_array = create_target_matrix(SPI_DATASET, SPI_FISCAL_YEAR) - sim = Microsimulation(dataset=SPI_2020_21) - household_weights = sim.calculate("household_weight", 2022).values + sim = Microsimulation(dataset=SPI_DATASET) + household_weights = sim.calculate("household_weight", SPI_FISCAL_YEAR).values calibration = Calibration( weights=household_weights, @@ -158,8 +163,8 @@ def create_income_projections(): calibration.calibrate() reweighted_weights = calibration.weights - sim = Microsimulation(dataset=SPI_2020_21) - sim.set_input("household_weight", 2022, reweighted_weights) + sim = Microsimulation(dataset=SPI_DATASET) + sim.set_input("household_weight", SPI_FISCAL_YEAR, reweighted_weights) incomes = pd.read_csv(STORAGE_FOLDER / "incomes.csv") @@ -167,7 +172,7 @@ def create_income_projections(): lower_bounds = incomes.total_income_lower_bound upper_bounds = incomes.total_income_upper_bound - for year in range(2022, 2030): + for year in range(SPI_FISCAL_YEAR, 2030): year_df = pd.DataFrame() year_df["total_income_lower_bound"] = lower_bounds year_df["total_income_upper_bound"] = upper_bounds From 2c05b7d64528ab325552cc9f62ea5c7d58a9d83e Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Sat, 23 May 2026 21:33:02 -0400 Subject: [PATCH 2/9] Invalidate SPI income model cache by release --- .../datasets/imputations/income.py | 28 +++++--- policyengine_uk_data/tests/test_spi_build.py | 67 +++++++++++++++++++ policyengine_uk_data/utils/qrf.py | 10 ++- 3 files changed, 96 insertions(+), 9 deletions(-) diff --git a/policyengine_uk_data/datasets/imputations/income.py b/policyengine_uk_data/datasets/imputations/income.py index c6d537d41..a3c829f16 100644 --- a/policyengine_uk_data/datasets/imputations/income.py +++ b/policyengine_uk_data/datasets/imputations/income.py @@ -125,7 +125,20 @@ def generate_spi_table( IMPUTATIONS = INCOME_COMPONENTS + ["gift_aid", "charitable_investment_gifts"] -INCOME_MODEL_PATH = STORAGE_FOLDER / "income.pkl" +INCOME_MODEL_METADATA = { + "spi_release_name": SPI_RELEASE_NAME, + "spi_tab_filename": SPI_TAB_FILENAME, + "imputations": tuple(IMPUTATIONS), +} +INCOME_MODEL_PATH = STORAGE_FOLDER / f"income_{SPI_RELEASE_NAME}.pkl" + + +def _income_model_matches_current_release(model) -> bool: + if getattr(model, "metadata", {}) != INCOME_MODEL_METADATA: + return False + + cached_outputs = set(getattr(model.model, "imputed_variables", [])) + return cached_outputs == set(IMPUTATIONS) def save_imputation_models(): @@ -138,6 +151,7 @@ def save_imputation_models(): from policyengine_uk_data.utils import QRF income = QRF() + income.metadata = INCOME_MODEL_METADATA spi = pd.read_csv(SPI_TAB_FOLDER / SPI_TAB_FILENAME, delimiter="\t") spi = generate_spi_table(spi) spi = spi[PREDICTORS + IMPUTATIONS] @@ -150,10 +164,9 @@ def create_income_model(overwrite_existing: bool = False): """ Create or load income imputation model. - If a cached model exists and its trained output columns don't match the - current ``IMPUTATIONS`` list, the cache is discarded and the model is - retrained. This handles the case where ``IMPUTATIONS`` is extended in - code but an older pickle is still on disk. + If a cached model exists and its training metadata or output columns don't + match the current SPI release and ``IMPUTATIONS`` list, the cache is + discarded and the model is retrained. Args: overwrite_existing: Whether to retrain model if it exists. @@ -165,10 +178,9 @@ def create_income_model(overwrite_existing: bool = False): if INCOME_MODEL_PATH.exists() and not overwrite_existing: cached = QRF(file_path=INCOME_MODEL_PATH) - cached_outputs = set(getattr(cached.model, "imputed_variables", [])) - if cached_outputs == set(IMPUTATIONS): + if _income_model_matches_current_release(cached): return cached - # Cached model was trained against a different output set; retrain. + # Cached model was trained against a different SPI release or output set. return save_imputation_models() diff --git a/policyengine_uk_data/tests/test_spi_build.py b/policyengine_uk_data/tests/test_spi_build.py index 36f177c31..f6c4ff4a2 100644 --- a/policyengine_uk_data/tests/test_spi_build.py +++ b/policyengine_uk_data/tests/test_spi_build.py @@ -17,6 +17,8 @@ import importlib.util import inspect +import pickle +from types import SimpleNamespace import numpy as np import pandas as pd @@ -206,3 +208,68 @@ def test_income_projection_uses_current_spi_release(): assert incomes_projection.SPI_DATASET.endswith(SPI_H5_FILENAME) assert incomes_projection.SPI_FISCAL_YEAR == SPI_FISCAL_YEAR + + +def test_income_model_cache_is_release_scoped(): + from policyengine_uk_data.datasets.imputations.income import ( + INCOME_MODEL_PATH, + ) + from policyengine_uk_data.datasets.spi import SPI_RELEASE_NAME + + assert INCOME_MODEL_PATH.name == f"income_{SPI_RELEASE_NAME}.pkl" + + +def test_income_model_cache_rejects_stale_spi_release(tmp_path, monkeypatch): + from policyengine_uk_data.datasets.imputations import income as income_module + + cache = tmp_path / "income_spi_2022_23.pkl" + stale_metadata = { + **income_module.INCOME_MODEL_METADATA, + "spi_release_name": "spi_2020_21", + "spi_tab_filename": "put2021uk.tab", + } + with cache.open("wb") as f: + pickle.dump( + { + "model": SimpleNamespace( + imputed_variables=list(income_module.IMPUTATIONS) + ), + "input_columns": income_module.PREDICTORS, + "metadata": stale_metadata, + }, + f, + ) + + sentinel = object() + monkeypatch.setattr(income_module, "INCOME_MODEL_PATH", cache) + monkeypatch.setattr(income_module, "save_imputation_models", lambda: sentinel) + + assert income_module.create_income_model() is sentinel + + +def test_income_model_cache_accepts_current_spi_release(tmp_path, monkeypatch): + from policyengine_uk_data.datasets.imputations import income as income_module + + cache = tmp_path / "income_spi_2022_23.pkl" + with cache.open("wb") as f: + pickle.dump( + { + "model": SimpleNamespace( + imputed_variables=list(income_module.IMPUTATIONS) + ), + "input_columns": income_module.PREDICTORS, + "metadata": income_module.INCOME_MODEL_METADATA, + }, + f, + ) + + monkeypatch.setattr(income_module, "INCOME_MODEL_PATH", cache) + monkeypatch.setattr( + income_module, + "save_imputation_models", + lambda: pytest.fail("current SPI release cache should be reused"), + ) + + assert income_module.create_income_model().metadata == ( + income_module.INCOME_MODEL_METADATA + ) diff --git a/policyengine_uk_data/utils/qrf.py b/policyengine_uk_data/utils/qrf.py index 05c0ba661..f85e0778a 100644 --- a/policyengine_uk_data/utils/qrf.py +++ b/policyengine_uk_data/utils/qrf.py @@ -39,6 +39,7 @@ def __init__(self, file_path: str = None): data = pickle.load(f) self.model = data["model"] self.input_columns = data["input_columns"] + self.metadata = data.get("metadata", {}) def fit(self, X, y): """ @@ -74,4 +75,11 @@ def save(self, file_path: str): file_path: Path where model should be saved. """ with open(file_path, "wb") as f: - pickle.dump({"model": self.model, "input_columns": self.input_columns}, f) + pickle.dump( + { + "model": self.model, + "input_columns": self.input_columns, + "metadata": getattr(self, "metadata", {}), + }, + f, + ) From 2149508b23f9d44ece773230a5e7c0fc121174c7 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Sat, 23 May 2026 22:08:21 -0400 Subject: [PATCH 3/9] Keep SPI projections scoped to target refresh --- policyengine_uk_data/tests/test_spi_build.py | 8 ------- .../utils/incomes_projection.py | 23 ++++++++----------- 2 files changed, 9 insertions(+), 22 deletions(-) diff --git a/policyengine_uk_data/tests/test_spi_build.py b/policyengine_uk_data/tests/test_spi_build.py index f6c4ff4a2..c78b3aea6 100644 --- a/policyengine_uk_data/tests/test_spi_build.py +++ b/policyengine_uk_data/tests/test_spi_build.py @@ -202,14 +202,6 @@ def test_income_spi_generation_handles_current_unknown_codes(): assert out["age"].between(16, 70, inclusive="left").all() -def test_income_projection_uses_current_spi_release(): - from policyengine_uk_data.datasets.spi import SPI_FISCAL_YEAR, SPI_H5_FILENAME - from policyengine_uk_data.utils import incomes_projection - - assert incomes_projection.SPI_DATASET.endswith(SPI_H5_FILENAME) - assert incomes_projection.SPI_FISCAL_YEAR == SPI_FISCAL_YEAR - - def test_income_model_cache_is_release_scoped(): from policyengine_uk_data.datasets.imputations.income import ( INCOME_MODEL_PATH, diff --git a/policyengine_uk_data/utils/incomes_projection.py b/policyengine_uk_data/utils/incomes_projection.py index 2e33a230d..9edb92f12 100644 --- a/policyengine_uk_data/utils/incomes_projection.py +++ b/policyengine_uk_data/utils/incomes_projection.py @@ -5,12 +5,10 @@ import warnings from policyengine_uk import Microsimulation from microcalibrate import Calibration -from policyengine_uk_data.datasets.spi import SPI_FISCAL_YEAR, SPI_H5_FILENAME +from policyengine_uk_data.datasets import SPI_2020_21 warnings.filterwarnings("ignore") -SPI_DATASET = str(STORAGE_FOLDER / SPI_H5_FILENAME) - tax_benefit = pd.read_csv(STORAGE_FOLDER / "tax_benefit.csv") tax_benefit["name"] = tax_benefit["name"].apply(lambda x: f"obr/{x}") demographics = pd.read_csv(STORAGE_FOLDER / "demographics.csv") @@ -80,13 +78,10 @@ def create_target_matrix( incomes = pd.read_csv(STORAGE_FOLDER / "incomes.csv") for variable in REWEIGHT_VARIABLES: incomes[variable + "_count"] = uprate_values( - incomes[variable + "_count"], - "household_weight", - SPI_FISCAL_YEAR, - time_period, + incomes[variable + "_count"], "household_weight", 2021, time_period ) incomes[variable + "_amount"] = uprate_values( - incomes[variable + "_amount"], variable, SPI_FISCAL_YEAR, time_period + incomes[variable + "_amount"], variable, 2021, time_period ) for i, row in incomes.iterrows(): @@ -148,10 +143,10 @@ def get_loss_results(dataset, time_period, reform=None): def create_income_projections(): - loss_matrix, targets_array = create_target_matrix(SPI_DATASET, SPI_FISCAL_YEAR) + loss_matrix, targets_array = create_target_matrix(SPI_2020_21, 2022) - sim = Microsimulation(dataset=SPI_DATASET) - household_weights = sim.calculate("household_weight", SPI_FISCAL_YEAR).values + sim = Microsimulation(dataset=SPI_2020_21) + household_weights = sim.calculate("household_weight", 2022).values calibration = Calibration( weights=household_weights, @@ -163,8 +158,8 @@ def create_income_projections(): calibration.calibrate() reweighted_weights = calibration.weights - sim = Microsimulation(dataset=SPI_DATASET) - sim.set_input("household_weight", SPI_FISCAL_YEAR, reweighted_weights) + sim = Microsimulation(dataset=SPI_2020_21) + sim.set_input("household_weight", 2022, reweighted_weights) incomes = pd.read_csv(STORAGE_FOLDER / "incomes.csv") @@ -172,7 +167,7 @@ def create_income_projections(): lower_bounds = incomes.total_income_lower_bound upper_bounds = incomes.total_income_upper_bound - for year in range(SPI_FISCAL_YEAR, 2030): + for year in range(2022, 2030): year_df = pd.DataFrame() year_df["total_income_lower_bound"] = lower_bounds year_df["total_income_upper_bound"] = upper_bounds From 48e3e0cd7381fc8dd7a6332a1fc4e5523edc2f1d Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Sat, 23 May 2026 22:16:01 -0400 Subject: [PATCH 4/9] Fix legacy SPI projection import --- policyengine_uk_data/tests/test_spi_build.py | 6 ++++++ policyengine_uk_data/utils/incomes_projection.py | 3 ++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/policyengine_uk_data/tests/test_spi_build.py b/policyengine_uk_data/tests/test_spi_build.py index c78b3aea6..a506c3101 100644 --- a/policyengine_uk_data/tests/test_spi_build.py +++ b/policyengine_uk_data/tests/test_spi_build.py @@ -211,6 +211,12 @@ def test_income_model_cache_is_release_scoped(): assert INCOME_MODEL_PATH.name == f"income_{SPI_RELEASE_NAME}.pkl" +def test_income_projection_imports_legacy_refresh_dataset(): + from policyengine_uk_data.utils import incomes_projection + + assert incomes_projection.SPI_2020_21.endswith("spi_2020.h5") + + def test_income_model_cache_rejects_stale_spi_release(tmp_path, monkeypatch): from policyengine_uk_data.datasets.imputations import income as income_module diff --git a/policyengine_uk_data/utils/incomes_projection.py b/policyengine_uk_data/utils/incomes_projection.py index 9edb92f12..b5fd45241 100644 --- a/policyengine_uk_data/utils/incomes_projection.py +++ b/policyengine_uk_data/utils/incomes_projection.py @@ -5,10 +5,11 @@ import warnings from policyengine_uk import Microsimulation from microcalibrate import Calibration -from policyengine_uk_data.datasets import SPI_2020_21 warnings.filterwarnings("ignore") +SPI_2020_21 = str(STORAGE_FOLDER / "spi_2020.h5") + tax_benefit = pd.read_csv(STORAGE_FOLDER / "tax_benefit.csv") tax_benefit["name"] = tax_benefit["name"].apply(lambda x: f"obr/{x}") demographics = pd.read_csv(STORAGE_FOLDER / "demographics.csv") From 1b0f76b1a723ff3cb30b30272cf68ae534dc725d Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Sat, 23 May 2026 22:26:55 -0400 Subject: [PATCH 5/9] Make SPI projections rebuildable from current release --- policyengine_uk_data/tests/test_spi_build.py | 46 +++++++++++++++++- .../utils/incomes_projection.py | 47 +++++++++++++++---- 2 files changed, 82 insertions(+), 11 deletions(-) diff --git a/policyengine_uk_data/tests/test_spi_build.py b/policyengine_uk_data/tests/test_spi_build.py index a506c3101..566e6005d 100644 --- a/policyengine_uk_data/tests/test_spi_build.py +++ b/policyengine_uk_data/tests/test_spi_build.py @@ -211,10 +211,52 @@ def test_income_model_cache_is_release_scoped(): assert INCOME_MODEL_PATH.name == f"income_{SPI_RELEASE_NAME}.pkl" -def test_income_projection_imports_legacy_refresh_dataset(): +def test_income_projection_uses_current_spi_release(): from policyengine_uk_data.utils import incomes_projection + from policyengine_uk_data.datasets.spi import SPI_FISCAL_YEAR, SPI_H5_FILENAME - assert incomes_projection.SPI_2020_21.endswith("spi_2020.h5") + assert incomes_projection.SPI_DATASET.endswith(SPI_H5_FILENAME) + assert incomes_projection.SPI_FISCAL_YEAR == SPI_FISCAL_YEAR + + +def test_income_projection_builds_current_spi_dataset_when_missing( + tmp_path, + monkeypatch, +): + from policyengine_uk_data.utils import incomes_projection + + tab_dir = tmp_path / "spi_2022_23" + tab_dir.mkdir() + tab_path = tab_dir / "put2223uk.tab" + tab_path.write_text("fake tab") + + calls = {} + + class FakeDataset: + def save(self, path): + calls["saved_path"] = path + path.write_text("fake h5") + + def fake_create_spi(path, fiscal_year): + calls["tab_path"] = path + calls["fiscal_year"] = fiscal_year + return FakeDataset() + + monkeypatch.setattr(incomes_projection, "STORAGE_FOLDER", tmp_path) + monkeypatch.setattr(incomes_projection, "SPI_RELEASE_NAME", "spi_2022_23") + monkeypatch.setattr(incomes_projection, "SPI_TAB_FILENAME", "put2223uk.tab") + monkeypatch.setattr(incomes_projection, "SPI_H5_FILENAME", "spi_2022_23.h5") + monkeypatch.setattr(incomes_projection, "SPI_FISCAL_YEAR", 2022) + monkeypatch.setattr(incomes_projection, "create_spi", fake_create_spi) + + dataset_path = incomes_projection.ensure_spi_dataset() + + assert dataset_path == str(tmp_path / "spi_2022_23.h5") + assert calls == { + "tab_path": tab_path, + "fiscal_year": 2022, + "saved_path": tmp_path / "spi_2022_23.h5", + } def test_income_model_cache_rejects_stale_spi_release(tmp_path, monkeypatch): diff --git a/policyengine_uk_data/utils/incomes_projection.py b/policyengine_uk_data/utils/incomes_projection.py index b5fd45241..1f9dbd5bf 100644 --- a/policyengine_uk_data/utils/incomes_projection.py +++ b/policyengine_uk_data/utils/incomes_projection.py @@ -5,10 +5,35 @@ import warnings from policyengine_uk import Microsimulation from microcalibrate import Calibration +from policyengine_uk_data.datasets.spi import ( + SPI_FISCAL_YEAR, + SPI_H5_FILENAME, + SPI_RELEASE_NAME, + SPI_TAB_FILENAME, + create_spi, +) warnings.filterwarnings("ignore") -SPI_2020_21 = str(STORAGE_FOLDER / "spi_2020.h5") +SPI_DATASET = str(STORAGE_FOLDER / SPI_H5_FILENAME) + + +def ensure_spi_dataset() -> str: + """Create the SPI H5 projection input from the current TAB release if needed.""" + dataset_path = STORAGE_FOLDER / SPI_H5_FILENAME + if dataset_path.exists(): + return str(dataset_path) + + tab_path = STORAGE_FOLDER / SPI_RELEASE_NAME / SPI_TAB_FILENAME + if not tab_path.exists(): + raise FileNotFoundError( + f"Missing SPI TAB file for projections: {tab_path}. " + "Run make download before refreshing income projections." + ) + + create_spi(tab_path, SPI_FISCAL_YEAR).save(dataset_path) + return str(dataset_path) + tax_benefit = pd.read_csv(STORAGE_FOLDER / "tax_benefit.csv") tax_benefit["name"] = tax_benefit["name"].apply(lambda x: f"obr/{x}") @@ -79,10 +104,13 @@ def create_target_matrix( incomes = pd.read_csv(STORAGE_FOLDER / "incomes.csv") for variable in REWEIGHT_VARIABLES: incomes[variable + "_count"] = uprate_values( - incomes[variable + "_count"], "household_weight", 2021, time_period + incomes[variable + "_count"], + "household_weight", + SPI_FISCAL_YEAR, + time_period, ) incomes[variable + "_amount"] = uprate_values( - incomes[variable + "_amount"], variable, 2021, time_period + incomes[variable + "_amount"], variable, SPI_FISCAL_YEAR, time_period ) for i, row in incomes.iterrows(): @@ -144,10 +172,11 @@ def get_loss_results(dataset, time_period, reform=None): def create_income_projections(): - loss_matrix, targets_array = create_target_matrix(SPI_2020_21, 2022) + spi_dataset = ensure_spi_dataset() + loss_matrix, targets_array = create_target_matrix(spi_dataset, SPI_FISCAL_YEAR) - sim = Microsimulation(dataset=SPI_2020_21) - household_weights = sim.calculate("household_weight", 2022).values + sim = Microsimulation(dataset=spi_dataset) + household_weights = sim.calculate("household_weight", SPI_FISCAL_YEAR).values calibration = Calibration( weights=household_weights, @@ -159,8 +188,8 @@ def create_income_projections(): calibration.calibrate() reweighted_weights = calibration.weights - sim = Microsimulation(dataset=SPI_2020_21) - sim.set_input("household_weight", 2022, reweighted_weights) + sim = Microsimulation(dataset=spi_dataset) + sim.set_input("household_weight", SPI_FISCAL_YEAR, reweighted_weights) incomes = pd.read_csv(STORAGE_FOLDER / "incomes.csv") @@ -168,7 +197,7 @@ def create_income_projections(): lower_bounds = incomes.total_income_lower_bound upper_bounds = incomes.total_income_upper_bound - for year in range(2022, 2030): + for year in range(SPI_FISCAL_YEAR, 2030): year_df = pd.DataFrame() year_df["total_income_lower_bound"] = lower_bounds year_df["total_income_upper_bound"] = upper_bounds From 2dda55e24a375f238c6f00aa2b350ae077ccb53e Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Sat, 23 May 2026 22:41:54 -0400 Subject: [PATCH 6/9] Regenerate SPI income projections from 2022-23 --- .../storage/incomes_projection.csv | 224 +++++++++--------- policyengine_uk_data/tests/test_spi_build.py | 73 ++++++ .../utils/incomes_projection.py | 35 ++- 3 files changed, 215 insertions(+), 117 deletions(-) diff --git a/policyengine_uk_data/storage/incomes_projection.csv b/policyengine_uk_data/storage/incomes_projection.csv index 9ddf68a1c..5c9399aab 100644 --- a/policyengine_uk_data/storage/incomes_projection.csv +++ b/policyengine_uk_data/storage/incomes_projection.csv @@ -1,113 +1,113 @@ total_income_lower_bound,total_income_upper_bound,employment_income_count,employment_income_amount,self_employment_income_count,self_employment_income_amount,state_pension_count,state_pension_amount,private_pension_income_count,private_pension_income_amount,property_income_count,property_income_amount,savings_interest_income_count,savings_interest_income_amount,dividend_income_count,dividend_income_amount,year -12570,15000.0,1617313,21188248221,418271,4413467024,1172836,11826062936,1280904,7133494286,123235,915876662,930742,75292588,87090,114735958,2022 -15000,20000.0,4014377,67829340021,710977,8781241045,1933393,20074524141,2157907,18643993417,263525,2117911173,1982684,295609788,409517,1294016065,2022 -20000,30000.0,7638005,182995057109,1007725,16805476353,2007068,20555725192,2426387,34918926059,466492,4542955025,3134286,553976946,849159,4506035072,2022 -30000,40000.0,4812367,160342105127,589889,12992143892,859074,8751501824,1154939,23672423318,367090,4194733040,2013275,408383811,670784,6000584630,2022 -40000,50000.0,3006948,123149855217,346450,9080447336,402000,4094287302,594931,15062584091,321211,4083433533,1291407,307220534,620441,11071531331,2022 -50000,70000.0,2630421,140304691170,274622,8245838410,289939,2943023709,456486,14259907455,345406,5209269108,1127049,321084056,560827,13085662134,2022 -70000,100000.0,1271798,95761122529,129824,5446310728,121174,1273645535,191935,8117995849,199996,3667590937,581997,224268287,310898,10218921999,2022 -100000,150000.0,597630,64680084778,80650,5095131003,48942,534003641,75172,4299979467,117448,2333040513,234468,173374315,197751,7003732097,2022 -150000,200000.0,206654,31836669131,38233,3912105875,13620,167514400,22631,1606409311,44907,1017860070,89484,84351723,80052,3648385291,2022 -200000,300000.0,141418,30272381674,30994,4654653927,10896,212997356,15755,1297059502,33925,978464377,71443,95614320,63005,3870353880,2022 -300000,500000.0,73471,23710330164,18593,4658553470,6747,170643744,8347,1009244028,19441,729769248,43753,86329166,40609,3621278715,2022 -500000,1000000.0,40063,21986754528,13955,6604960307,3610,69570380,4026,515985187,10904,507445127,27446,107022725,27414,4415149392,2022 -1000000,inf,20945,39783464634,9926,21651235668,1151,13592774,1863,321927859,7027,446636040,16075,236121715,18622,16942014951,2022 -12570,inf,26071355,1003838434401,3670108,112341565037,6870448,70687092936,8391284,130859929828,2320606,30744984854,11544052,2968649630,3936170,85792401513,2022 -12570,15000.0,843123,11236073687,261924,2985312131,573270,6295209438,613494,3281912515,66533,523610843,481988,45095368,48755,68304803,2023 -15000,20000.0,2812617,44769791351,674970,8556479810,1833835,20625771473,2067113,16105512445,241655,2017275760,1712442,250305968,329578,1028052357,2023 -20000,30000.0,7325103,167321241244,971036,16168318506,2308559,26070153401,2710874,36780181679,438386,4398726735,3236246,578420205,783261,3784124613,2023 -30000,40000.0,5128594,157730546071,695222,14694800813,1123161,12600652842,1376781,27777000009,377434,4465903127,2105449,451095854,669749,6077939465,2023 -40000,50000.0,3585416,142165783416,401701,10893258804,485785,5459480154,688282,17671918002,313829,4169001683,1378695,334807019,582446,8582336855,2023 -50000,70000.0,3366673,168287105436,321676,9580842849,385986,4320363936,584437,18438376023,402500,6183111336,1412532,404071882,724464,17470410298,2023 -70000,100000.0,1690128,121506914967,148372,6228131128,154292,1756844069,237542,10118944849,220047,4250260072,672094,262587416,324847,10550076980,2023 -100000,150000.0,820537,83596472226,87106,5434939632,63210,744044615,95262,5588728202,136946,2929430041,314427,199229173,225869,9164272261,2023 -150000,200000.0,258145,38533753818,41781,4079902702,16644,214246932,26169,1935242857,50791,1181180429,96639,93980776,92334,3745070823,2023 -200000,300000.0,182638,37334672184,35325,5155059876,12875,243655890,18571,1598760207,38889,1127704046,81779,109999197,73121,4546897775,2023 -300000,500000.0,93522,29355059934,20478,5066696208,7316,181113784,9649,1213864960,22037,921337520,49639,102052560,45056,3993942809,2023 -500000,1000000.0,45500,23723048298,14441,6949979474,4337,87176896,4345,609054214,11711,562976372,30920,120440660,30813,5243819452,2023 -1000000,inf,25026,46374174408,10973,23601745566,1799,22223116,1970,350074947,8060,501453198,17674,258929932,19793,18453370935,2023 -12570,inf,26164351,1071171820250,3685005,119395467498,6971069,78620936546,8434490,141469570911,2328820,33231971162,11590524,3211016010,3950084,92708619426,2023 -12570,15000.0,533323,7108150056,181449,2146373087,318843,3627100876,392586,2458451295,56098,480899728,307255,31427626,35167,54572598,2024 -15000,20000.0,2616662,41080651005,658600,8404482195,1837300,21890115761,2072252,15245919881,232087,1942395036,1671822,235641911,304358,965592535,2024 -20000,30000.0,6910022,155050979947,966351,16044292521,2450769,29599550079,2819527,36444949890,433085,4298115216,3242869,576059894,776200,3666496936,2024 -30000,40000.0,5536028,168485102530,744641,15787662561,1232873,14882183445,1447542,28424432829,384940,4483893690,2150119,451459994,677042,6042908705,2024 -40000,50000.0,3795995,150938282334,423608,11848046987,530409,6390505804,724365,18261455269,310500,4097348122,1435137,345331530,591159,8417973689,2024 -50000,70000.0,3538290,176975142910,352253,10638395867,425589,5105092076,614948,18986728206,419010,6334560538,1502228,415014261,755087,17722018286,2024 -70000,100000.0,1812347,130174043505,157796,6752036852,168984,2049531762,248186,10368669686,225940,4300728261,699635,267516303,329432,10633733432,2024 -100000,150000.0,911929,94055996648,90741,5842428900,68236,858377967,99582,5673183821,140586,2972219639,351336,203284531,228764,9210141628,2024 -150000,200000.0,265977,39936021254,42779,4128145235,17254,235282632,26878,1945439421,52905,1195296237,99106,94210484,96346,3803256247,2024 -200000,300000.0,190733,38995907971,37099,5439994469,13694,258685161,18743,1559425398,40727,1148105719,84588,111242513,75692,4515505248,2024 -300000,500000.0,101684,31868820026,21893,5414970584,8001,198064668,10401,1286357007,23048,937077747,52706,102365266,46658,4003559094,2024 -500000,1000000.0,47851,24856320702,14636,6922573453,4499,96329669,4431,613692305,12197,557586016,32424,121486755,32430,5274912663,2024 -1000000,inf,26610,49461753821,11698,25262632224,1808,23556353,2017,350963301,8300,515042622,18357,259886480,20086,18483842623,2024 -12570,inf,26287452,1108987172709,3703545,124632034936,7078259,85214376252,8481458,141619668309,2339423,33263268570,11647583,3214927548,3968421,92794513683,2024 -12570,15000.0,397648,5431425863,95411,1131586102,149252,1819054391,173827,1103045453,28496,256585748,186798,19019331,20214,32835476,2025 -15000,20000.0,2455344,38471462065,646589,8310950387,1722945,21134674396,1941485,13854551564,213849,1846825315,1542858,207239148,262236,804064757,2025 -20000,30000.0,6725705,150473893962,957759,15956448525,2539152,31746979013,2881443,36593475414,421314,4318144012,3203190,595655533,745073,3449453705,2025 -30000,40000.0,5647922,171143600713,779962,16624354851,1315160,16645151834,1546958,30582606095,390418,4664022006,2219894,482176525,669779,5929346090,2025 -40000,50000.0,3934646,157158903741,439525,12395050887,620689,7760893290,806552,20606229440,303104,4179167617,1510217,366118786,582376,7675351842,2025 -50000,70000.0,3671529,182822780485,393018,12219483438,490784,6142528842,694332,21835031578,442584,6763455075,1589617,461844071,806153,19186649128,2025 -70000,100000.0,1899912,135434403223,173116,7474939194,201154,2529853014,287701,12365283613,248359,4953264063,753133,300700046,360245,11683965831,2025 -100000,150000.0,979383,100335533705,96760,6165403881,79730,1033748119,113892,6683888427,153787,3447777837,381372,230334975,250211,10722457328,2025 -150000,200000.0,273418,41252853110,44425,4247215925,19764,273331393,29302,2208024937,55501,1318172084,102156,102129091,98364,3713878618,2025 -200000,300000.0,199891,40283209266,39127,5776288282,15919,294767447,20817,1828740697,43519,1278760742,89232,119757299,82376,5263725031,2025 -300000,500000.0,106487,33164209023,23341,5738915302,8618,208972717,11131,1386264892,24591,1093724569,55350,118095641,49147,4346410755,2025 -500000,1000000.0,51274,26547428136,14926,6938820193,4886,105852482,4906,765803956,13062,576174766,33775,130803986,34241,5792578949,2025 -1000000,inf,26864,50644887128,12613,27021501681,1844,24614709,2074,373252939,8546,579418267,19187,276368912,20632,19789092756,2025 -12570,inf,26370022,1133164590419,3716570,130000958647,7169895,89720421646,8514422,150186199003,2347129,35275492101,11686779,3410243344,3981047,98389810267,2025 -12570,15000.0,284401,4003725725,17732,147078443,74923,957123578,60306,172132464,6604,46028865,101103,9244587,9311,15965643,2026 -15000,20000.0,2345431,36962242079,636642,8335236226,1588079,19960655075,1800032,12854418041,203802,1844088763,1440144,187088952,232587,689039295,2026 -20000,30000.0,6454559,144192679119,950130,15889059441,2584328,33206640892,2898235,36072630396,411206,4266507687,3142405,598498547,713869,3235052416,2026 -30000,40000.0,5865632,178418051066,807993,17524784854,1371469,17854558698,1642304,32840548892,389091,4817007673,2281036,506808784,663984,5781178872,2026 -40000,50000.0,4022262,161997848269,448463,12705589813,677493,8689817475,873563,22644657779,299408,4156841468,1572801,389404670,577926,7215036289,2026 -50000,70000.0,3698018,186591828331,430830,14058712441,534362,6873284739,745781,23880808783,439976,6938840004,1640515,495656920,784042,17059739216,2026 -70000,100000.0,2047654,141923191246,194535,8284719831,224621,2898156689,322403,13981337392,286911,5808618841,816189,339447195,443917,15677599277,2026 -100000,150000.0,1048299,107045075157,102029,6554381944,89675,1187900462,128334,7677174131,163385,3794166555,418397,250145698,268543,12047927482,2026 -150000,200000.0,285262,43112513157,45828,4326447835,22323,313750528,32581,2533732358,58802,1437466364,105241,108378597,102973,3925199748,2026 -200000,300000.0,208035,41735442414,41421,6088523896,16649,275664226,22139,2035423258,46858,1419157659,94096,130133744,87051,5742374807,2026 -300000,500000.0,110460,34323794901,24798,6213730757,8479,227652293,11078,1292535705,25931,1222514565,56799,119094155,50510,4556981255,2026 -500000,1000000.0,55570,28414611732,16457,7545643026,6358,144108876,6459,1071200071,13957,622353008,36771,151500018,37839,6422119621,2026 -1000000,inf,27446,52360639402,12931,28503616591,1853,25262935,2154,392409201,8660,608407839,19516,290278066,20801,20768336340,2026 -12570,inf,26453030,1161081642598,3729790,136177525098,7200611,92614576467,8545369,157449008472,2354590,36981999292,11725012,3575679933,3993352,103136550262,2026 -12570,15000.0,157614,2226514997,11419,115298820,43049,546722056,33845,99112819,2617,22017366,56940,6193902,4771,8853075,2027 -15000,20000.0,2223214,35402364648,555145,7456997250,1357334,17485274910,1498280,10713990174,164372,1557574943,1250664,149720201,174605,429100441,2027 -20000,30000.0,6203414,138834762178,934623,15659944901,2570589,34042587726,2844152,34572598129,388886,4206868786,3017719,604153765,685734,3108597813,2027 -30000,40000.0,6080671,186203956673,828891,18267316336,1470708,19622929846,1790904,36579876310,386545,4946268425,2376429,542023924,639039,5321395421,2027 -40000,50000.0,4098137,166944727639,460863,13163090850,749099,9869438240,950052,25056491329,297294,4323473061,1641086,421578492,568035,6606459754,2027 -50000,70000.0,3770816,190477817490,469723,15842128343,604335,7986055099,845496,27984743447,458723,7652871105,1748588,563132781,820562,18187476247,2027 -70000,100000.0,2161470,148735998930,219621,9247004089,266348,3513593021,378448,16986140600,316390,6614210731,879024,393824692,495114,17800024336,2027 -100000,150000.0,1106276,112820968990,111000,7045651031,104304,1406770159,149362,9282047900,176753,4401898570,458591,279132462,287755,12864985783,2027 -150000,200000.0,312758,45988801397,49699,4493773309,28000,398892466,39467,3224267421,68129,1802993891,115191,144288878,121151,5665751055,2027 -200000,300000.0,216456,43278263991,44346,6528420120,17254,261553597,24309,2362919534,49781,1616057609,98486,138771405,91377,6165331079,2027 -300000,500000.0,118643,36152473383,26718,6691546980,10106,247770448,13442,1705796458,28050,1013600098,61441,130553805,55996,5304615303,2027 -500000,1000000.0,58957,30020975132,17416,7959157397,7543,198583368,6720,1213391884,15675,1134790642,39848,177287001,40553,7535489345,2027 -1000000,inf,28098,54231566808,13541,30341716033,1865,25972647,2230,426153729,9060,687499196,19947,315639880,21238,22480167023,2027 -12570,inf,26536523,1191319192256,3743005,142812045458,7230490,95605545353,8576662,170207253774,2362275,39980124425,11763909,3866292782,4005930,111478246674,2027 -12570,15000.0,43250,547260875,9518,109572877,33003,411490857,25697,85668640,2203,20131951,20837,4681719,3510,6595012,2028 -15000,20000.0,2113708,34203074274,473365,6525113371,1135871,15013231022,1183587,8042139364,118061,1116652605,1067959,115069303,125690,263334619,2028 -20000,30000.0,5866955,131326932205,919684,15635037381,2546834,34753144224,2849394,34922424472,378287,4246272743,2907344,600277416,649167,2957600481,2028 -30000,40000.0,6291548,194764752870,836628,18874924300,1506390,20610618588,1810232,37158850467,373451,4934259286,2381332,571898095,623578,5097188919,2028 -40000,50000.0,4196513,173682520026,462999,13006979729,811666,10980804880,1026718,27432820140,293261,4503277220,1715980,451631209,547909,6201165846,2028 -50000,70000.0,3817546,192916128409,515705,17901307821,700674,9520368873,977036,33564695948,479924,8393640211,1860701,639502562,860946,19442323963,2028 -70000,100000.0,2262917,156196112423,245648,10431247111,308775,4185404952,437158,20153419279,338461,7430477103,954880,449563463,525723,18956824053,2028 -100000,150000.0,1174062,119171478189,119507,7580696445,121093,1663874523,172912,11130437531,193911,5155472105,501729,317819060,313038,14572185744,2028 -150000,200000.0,333372,48599629156,53366,4669919524,33264,482743102,46419,4029874927,75314,2146127250,124035,161469945,133793,6870049292,2028 -200000,300000.0,226873,45159925456,48221,7097630842,20081,305793949,28288,2970663268,53572,1833254339,105386,155861314,98485,6811024520,2028 -300000,500000.0,124941,38034305310,28261,7055972186,10727,255249853,14026,1827977373,29646,1147236805,64378,132018899,59273,5836785467,2028 -500000,1000000.0,61681,30668522455,18268,8368871254,8284,219026825,7762,1510238989,17570,1361971160,42423,222153751,43429,8765575489,2028 -1000000,inf,29803,57247064345,14240,32288400530,1870,26608901,2307,460217073,9331,765355789,20772,342170594,22064,24250691013,2028 -12570,inf,26543169,1222517705993,3745409,149545673373,7238518,98428186821,8581522,183289207202,2362992,43054128567,11767744,4164106845,4006604,120031344418,2028 -12570,15000.0,41322,522174791,9136,106481624,31865,399334404,24278,81593683,2067,19628044,19690,4421706,4083,7788425,2029 -15000,20000.0,1959559,32427629478,370332,5133019076,976260,13082968505,931291,5590351923,80459,699610104,914889,93930277,97080,190565674,2029 -20000,30000.0,5514866,123647055299,942571,16356359966,2505759,34384205939,2847065,35994008007,364185,4319608486,2775386,581926805,597147,2670847802,2029 -30000,40000.0,6496707,204600695976,808977,18507573314,1507572,20704038655,1763629,36142154559,348427,4781269141,2357532,596070159,600272,4678714220,2029 -40000,50000.0,4259677,178736924469,482111,13612310733,854265,11643337828,1113634,30549560706,304316,4902300404,1785116,495175585,538919,6211204554,2029 -50000,70000.0,3836680,195159795735,550918,19636501335,785823,10706908774,1077388,38119827813,474834,8564045062,1943138,687994382,856363,18916410903,2029 -70000,100000.0,2376795,163646937322,277978,11854892360,355513,4837458772,511127,24159428650,372588,8556670378,1040324,519081890,583141,21426450769,2029 -100000,150000.0,1244473,125716702018,132842,8291630742,142941,1969988420,204496,13605664626,216724,6097252826,554298,371811545,347205,16653158370,2029 -150000,200000.0,357143,51827208117,56289,4870213248,38940,554057655,53551,4880091840,81308,2505862349,134156,185312566,147335,8263579353,2029 -200000,300000.0,232665,46215797479,51648,7581090662,22792,344277284,32038,3517051694,57995,2122204759,110881,176906927,103700,7356179584,2029 -300000,500000.0,132179,39994880890,30191,7479900857,11530,262058418,15230,2221574216,32519,1404396444,68507,145572226,63558,6564150570,2029 -500000,1000000.0,65167,32071479900,19830,9366495732,9284,244865427,8919,1730353411,18375,1451646032,44872,245561015,45612,9234666542,2029 -1000000,inf,31452,59862029985,14649,33904055509,1878,26717273,2394,497368158,9895,873603983,21767,374324090,23610,26883847881,2029 -12570,inf,26548685,1254429311460,3747472,156700525157,7244421,99160217356,8585040,197089029286,2363691,46298098012,11770556,4478089174,4008026,129057564647,2029 +12570,15000.0,1578124,19579833586,407614,4379192739,1219423,11426466381,1382995,6884598300,148097,1048901471,1337832,309574000,254747,504371433,2022 +15000,20000.0,3909966,62715685925,675494,8286536208,1946066,19078022730,2222185,16193853359,323528,2503127415,2664966,664430574,491636,1250802231,2022 +20000,30000.0,7320669,167537680555,960689,15898598172,2042991,19825665439,2495547,31063307877,495253,4627781046,4161127,1068362563,879564,4048983448,2022 +30000,40000.0,4580533,144842513925,569655,12429253804,933740,8959798797,1243917,22689483682,379651,4129576301,2621129,771559127,707807,5327593672,2022 +40000,50000.0,2891875,113374465070,337735,8731015854,427508,4102788280,627879,14125975424,333347,4022947133,1635029,599435745,641382,9616772420,2022 +50000,70000.0,2527815,127337068615,263670,7999064428,324238,3127154626,492408,14220360749,360628,5045751459,1400096,678967078,585411,12079979388,2022 +70000,100000.0,1226351,87894784885,123807,5416208899,135100,1335895204,207989,8158732868,208495,3632297608,701813,519665468,318244,9612017761,2022 +100000,150000.0,626269,65409755996,74186,5467431696,65607,703576525,95314,5350425811,129125,2464128049,297278,445352480,220476,6838629489,2022 +150000,200000.0,217620,32303225516,35862,4102785247,18284,210079106,27909,1898587208,49895,1161052667,113500,233992663,87565,3531788132,2022 +200000,300000.0,147842,30221527236,29179,4861124722,12824,145158095,18227,1534276590,38560,1090538560,87513,262476670,68599,3658469589,2022 +300000,500000.0,78172,24583703333,19402,4842254825,6029,64914036,10734,1091037466,23241,677824107,54147,209381689,45443,3536941269,2022 +500000,1000000.0,33813,14916871650,29075,4737287689,2105,7476782,24571,364127999,29728,313341414,31585,157478774,32201,2955462940,2022 +1000000,inf,17391,25858436258,16273,18921283498,1249,1499087,13731,200568070,16397,289893101,16642,391826549,16770,7951697829,2022 +12570,inf,24975567,911500714174,3513428,105385524497,7126013,68893793060,8848191,123516596114,2528842,30894288721,15115591,6298161492,4313537,69637976885,2022 +12570,15000.0,1700948,21197582236,432364,4661912112,1146705,11400333614,1323651,6335839718,137702,999182716,1321215,304757664,214830,373157462,2023 +15000,20000.0,3210785,50812827549,649078,8607910153,1962769,21047879943,2205152,14735349801,278133,2211096160,2427788,613147054,444772,1115420546,2023 +20000,30000.0,7340792,169878056114,904772,16264359538,2140682,23013393168,2570897,30434271906,454992,4239940457,4197075,1076350407,824267,3491928437,2023 +30000,40000.0,5147057,164340480752,544899,12822183602,1016454,10757653530,1347715,23873002794,357158,3892790802,2874647,790891707,694962,4841357403,2023 +40000,50000.0,3345211,135934263635,307044,8621106283,455382,4818165439,672871,14620050064,301164,3564626144,1870702,592341267,559281,6290675801,2023 +50000,70000.0,3442050,174170985746,275553,8842739208,374815,3941518725,596292,16087106975,413482,5558500591,1863877,780568893,743023,15790683750,2023 +70000,100000.0,1589170,117027531321,125901,6067806045,142918,1530216357,235913,8566432661,223308,3751225677,898688,536400140,312682,8248778302,2023 +100000,150000.0,808534,83364170438,81121,6041757797,65167,750237049,101991,5081211793,149368,2666905927,376443,433890806,255365,8878335572,2023 +150000,200000.0,270971,40501252342,36691,4165167148,17842,226934446,29769,1809126948,57590,1205790275,130946,223546861,99037,3961987269,2023 +200000,300000.0,182715,37735823028,29112,4792835394,11952,151856544,18889,1420014138,43042,1086977961,99725,254651034,76881,4033842444,2023 +300000,500000.0,91659,29235774405,18318,4976786971,6063,73895312,9181,1021260276,23785,747190963,59267,205724156,48648,3779960464,2023 +500000,1000000.0,52772,23984860241,49010,7218633418,3049,8654576,41077,557785716,49212,502660236,51168,247865805,51633,4036471327,2023 +1000000,inf,27083,36060095890,26729,23345807142,2009,2123426,22444,339151965,26628,458145785,26837,539867292,26879,11281315750,2023 +12570,inf,27209735,1084242693907,3480591,116429004811,7345807,77722862129,9175842,124880604755,2515564,30885033696,16198379,6600003086,4352249,76123368240,2023 +12570,15000.0,1630203,20461247283,436711,4749491996,1069968,11139040541,1233699,5566811713,128993,968475567,1231495,281439041,192181,327325733,2024 +15000,20000.0,3088830,48710368041,654099,8660025265,2056305,23570314696,2294780,14249377073,268552,2166462229,2421006,619581215,440732,1093116537,2024 +20000,30000.0,7065803,163851678851,927145,16675265876,2328561,27279242503,2749143,31190998803,458071,4332679306,4184930,1112053989,807079,3362187179,2024 +30000,40000.0,5369179,171915190557,563651,13345121415,1134517,13068050810,1464276,25820858531,355804,3994950073,2996883,846159218,687535,4614106753,2024 +40000,50000.0,3536122,144312034572,320430,9027204438,527338,6060168777,758413,16557639406,297054,3537206746,1975498,634627472,570977,6101490351,2024 +50000,70000.0,3829069,195055120580,306067,9823628363,440206,5035819590,687575,18708781711,445401,6096771601,2096608,857031557,819889,17062768406,2024 +70000,100000.0,1768975,130719471791,135650,6341374567,166119,1923993398,273461,10051790691,240184,4103931602,997747,585632172,332393,8599155084,2024 +100000,150000.0,914434,94576484979,86791,6328974068,73887,918540261,117054,5881030843,161560,2960956261,442787,467853836,271294,9764229114,2024 +150000,200000.0,303091,45579014580,38765,4357999109,20343,281333631,33382,2094392517,63114,1326119507,144029,255944262,107012,4252908253,2024 +200000,300000.0,204502,42516117641,31635,5145207799,13226,181679984,20917,1593543809,46927,1199197739,109108,270244207,83847,4331930208,2024 +300000,500000.0,101411,32476137901,19131,5244119713,6695,90878299,10288,1174700253,25993,830413101,64231,226718677,51868,4014504189,2024 +500000,1000000.0,57280,26941481011,48900,7287947107,3319,14013533,40955,656953648,49582,544394485,53826,255648903,53757,4332046824,2024 +1000000,inf,28936,39261527302,28568,24658409963,2156,2742107,23922,383656024,28455,499133268,28666,582478915,28751,12008510824,2024 +12570,inf,27897834,1156375875090,3597544,121644769679,7842638,89565818129,9707865,133930535022,2569689,32560691483,16746812,6995413464,4447315,79864279455,2024 +12570,15000.0,1564312,19754666807,439490,4818445845,1028456,10788463744,1187178,5346714471,120902,944491730,1174882,272202724,176846,313931582,2025 +15000,20000.0,2987477,47058712401,658288,8748182090,2050873,23735410673,2291116,14178280164,261903,2156098506,2372021,625910396,435490,1056335392,2025 +20000,30000.0,6717087,156413120605,930396,16732871714,2381675,28406679873,2791108,31504958877,454182,4405341769,4062798,1152394114,779010,3180802282,2025 +30000,40000.0,5498947,176773708265,569816,13532726286,1179140,13817361469,1514097,26766611341,354229,4115254678,3046986,874588595,676470,4462142845,2025 +40000,50000.0,3704123,151846037754,328074,9169750472,570091,6645629281,807733,17858553493,291110,3580463727,2049104,665492700,576152,5906096144,2025 +50000,70000.0,4212864,216487932698,324325,10136185401,479328,5569991854,752638,20496524625,464953,6554524789,2306970,946274303,881416,18151539077,2025 +70000,100000.0,1989083,147612961429,143408,6476370194,182215,2144100563,300983,11020664936,259993,4444802483,1098144,618630496,351551,9070025732,2025 +100000,150000.0,1030937,107054586442,90635,6487561147,82978,1037457986,133962,6669001196,175228,3373300740,515731,526542941,289875,10841452071,2025 +150000,200000.0,336313,50906968782,40501,4449153411,21993,306412401,36561,2342695657,67433,1420719371,154924,272731593,114266,4538275864,2025 +200000,300000.0,229330,47998827168,32754,5209986685,14313,200048848,22824,1739076467,51312,1341024270,120004,294189729,90762,4695058555,2025 +300000,500000.0,113115,36515267353,19975,5396350684,7303,99873421,11367,1279264319,28289,925436656,69586,248384313,56049,4322296224,2025 +500000,1000000.0,60991,29631709001,47644,7023892933,3523,18698083,39997,736869438,49319,598340823,55503,266028883,54952,4596558870,2025 +1000000,inf,31204,42971111467,30797,25445025063,2310,2896579,25844,418685966,30683,547696496,30897,629503151,31004,12886486433,2025 +12570,inf,28475772,1231024919576,3656103,123626501925,8004199,92773024775,9915408,140357900948,2609537,34407496039,17057550,7392873939,4513834,84020107617,2025 +12570,15000.0,1435299,18145065332,460781,5195002481,947057,10184968813,1084887,4708970823,113272,907025329,1059315,243807063,156462,286543995,2026 +15000,20000.0,2935666,46127004722,662387,8768658951,2028869,24301503785,2275401,13679842621,250762,2095392660,2328346,622458086,425028,1008487504,2026 +20000,30000.0,6453160,150037448615,942282,17092011180,2514547,31684093324,2919372,31836338466,451887,4426105691,4042085,1169461979,756581,2995283832,2026 +30000,40000.0,5607005,180385476453,584836,13970117479,1259568,15550497517,1587374,27918051697,353033,4170789414,3101857,904804083,673701,4410474822,2026 +40000,50000.0,3796473,155737066941,343134,9624502986,629326,7738487301,874997,19525710554,290237,3655784027,2113407,709193639,577563,5730552928,2026 +50000,70000.0,4470000,230713135826,347103,11026592176,530087,6504278720,823899,22702943577,476807,6790234628,2460702,992802121,926753,18830019732,2026 +70000,100000.0,2156088,159880006409,154123,6903924580,205216,2530126160,335713,12421464100,277757,4786847255,1190722,668122642,372134,9516296174,2026 +100000,150000.0,1117016,116344493915,95481,6781942763,92627,1208236556,148095,7474141939,185768,3666460890,571735,574878908,303670,11577817001,2026 +150000,200000.0,357842,54267429736,41983,4533170535,23856,351307913,39812,2609810201,71068,1529244073,162757,284393876,118929,4739070203,2026 +200000,300000.0,247193,51903228914,34605,5540213803,15497,224937007,24534,1923322569,54353,1420284038,128132,309386676,95994,4885702883,2026 +300000,500000.0,121303,39130137495,21261,5743667352,7900,115360337,12583,1422427387,30206,992124564,73398,267833581,58987,4584642992,2026 +500000,1000000.0,64296,31846664927,46888,7072752755,3847,24628257,39072,813652951,49090,646429146,57366,272825100,56084,4745251666,2026 +1000000,inf,32941,45581833608,32528,26728457788,2361,3188226,27447,459229149,32392,583627668,32638,660610794,32746,13530989929,2026 +12570,inf,28794283,1280098992893,3767392,128981014829,8260757,100421613915,10193187,147495906033,2636631,35670349382,17322462,7680578548,4554632,86841133659,2026 +12570,15000.0,1417744,17924557619,456223,5268240632,902870,9890869036,1016218,4173281911,107702,884580224,1000807,231559043,147036,269573981,2027 +15000,20000.0,2877369,45262410784,665932,8835401862,1985863,24258060413,2246521,13397848053,246176,2096683541,2285560,621064157,414481,993200615,2027 +20000,30000.0,6303227,146635389874,944873,17166220071,2597728,33837752721,2994586,31910321745,448541,4500315542,4008092,1198768102,736194,2817610833,2027 +30000,40000.0,5634552,181387315611,605685,14631612714,1316623,16870561326,1649469,28811923534,348704,4159042204,3154250,935166392,668041,4423560538,2027 +40000,50000.0,3865393,158624568522,365584,10441629414,678529,8633795836,931471,20952636761,293361,3790507988,2162188,727054401,579077,5536000720,2027 +50000,70000.0,4660547,241274296758,368997,11979068923,570961,7233812112,873176,24315447161,487232,7031437965,2562613,1052706952,961963,19497132607,2027 +70000,100000.0,2287387,169566214811,165049,7437752301,223108,2841298323,364587,13621975302,292809,5172955986,1265396,721974752,390930,10042416315,2027 +100000,150000.0,1179574,122821070671,100669,7108206329,100891,1354468098,160086,8181848075,194108,3936281446,615673,605256300,315404,12341609382,2027 +150000,200000.0,375542,57001400135,43619,4722895028,25782,386151570,42248,2805635573,74108,1634675731,170266,307447116,123442,4865297920,2027 +200000,300000.0,263169,55246964661,36955,5919088238,16476,248752383,26413,2107190454,57333,1507256898,134719,324662651,100752,5200892167,2027 +300000,500000.0,127567,41097136207,22318,5998433218,8492,128041069,13434,1522212761,31930,1095391949,77243,291675813,61959,4859301476,2027 +500000,1000000.0,66626,33310661499,46730,7424720474,4077,28252634,38425,884188442,48931,684054262,58710,278754617,57070,5000048324,2027 +1000000,inf,34473,47731185615,34047,28177452877,2381,3413110,28647,489336269,33855,620378796,34124,694375428,34221,14164278302,2027 +12570,inf,29093171,1317883172766,3856682,135110722079,8433781,105715228630,10385281,153173846041,2664791,37113562532,17529639,7990465725,4590569,90010923180,2027 +12570,15000.0,1308053,16477688515,443180,5202852800,847959,9350215069,944820,3789892747,101718,854086396,930942,223862655,134694,246463957,2028 +15000,20000.0,2912188,45857364022,682435,9122350676,1949323,24182651182,2220537,13212216347,238697,2067187088,2262659,612697739,402598,970577667,2028 +20000,30000.0,6128224,142526726821,944976,17277701674,2657028,35435873657,3047205,31895434340,447472,4558338562,3977074,1230121291,725205,2714242048,2028 +30000,40000.0,5664727,182454403522,623402,15169531571,1355974,17829739790,1692573,29470889218,347688,4204445438,3167861,967637219,658569,4330154986,2028 +40000,50000.0,3955893,162349186603,380351,11070892257,724994,9443667397,981383,22290679605,294057,3901544609,2221734,753245015,580958,5396778362,2028 +50000,70000.0,4802981,249113050840,394249,13069042787,610501,7937253084,926899,26041336060,495619,7291630560,2653874,1109025303,990193,20176077805,2028 +70000,100000.0,2420071,179218365491,177412,8060378086,243122,3174869726,393437,14891185546,306995,5508849588,1339807,773717767,410619,10553231686,2028 +100000,150000.0,1242298,129477533694,105613,7421346612,107311,1467993755,171385,8837456311,201974,4209549393,655275,637600614,327671,13062015115,2028 +150000,200000.0,390604,59328061916,45408,4971998595,27618,416578778,44975,3053847039,77524,1758125367,177397,318703971,127546,5027526195,2028 +200000,300000.0,277455,58250635935,39204,6250710831,17821,276781073,28302,2279568572,60120,1622344064,141385,343174722,105841,5507272251,2028 +300000,500000.0,133430,42889822205,23401,6329025223,8963,139816880,14137,1609139717,33589,1154587335,80431,315160786,64264,5093929456,2028 +500000,1000000.0,68712,34472102504,46339,7721660169,4328,32660564,37770,976737110,48486,736464358,60019,283061893,57972,5244292604,2028 +1000000,inf,36310,50024381087,35792,29700962715,2458,3659881,29979,524392631,35600,661588837,35891,732328989,35988,14872180954,2028 +12570,inf,29340945,1352439323154,3941763,141368453997,8557400,109691760835,10533403,158872775244,2689538,38528741595,17704350,8300337963,4622117,93194743087,2028 +12570,15000.0,1268746,16013479043,425795,5071351269,790159,8776754694,886075,3589958831,95632,811466817,888000,211207472,125453,231175910,2029 +15000,20000.0,2900120,45730270810,696659,9387835835,1923254,24193504975,2190060,12874202234,235755,2105099955,2217722,613399264,390486,956528179,2029 +20000,30000.0,5947831,138084912841,948297,17440443110,2707402,36974630685,3090900,31842985801,444544,4612875945,3939497,1247889145,717119,2671641425,2029 +30000,40000.0,5726587,184646565561,635374,15558753156,1396335,18830651712,1737440,30081780580,345308,4219052067,3196514,997732893,654896,4243650640,2029 +40000,50000.0,4016929,165017177348,404872,12006784158,772021,10313136057,1033120,23631491392,292096,3943388008,2270522,779374469,574183,5188482262,2029 +50000,70000.0,4947632,257001302612,412939,13971233115,650029,8663847391,977304,27669864745,505650,7566030928,2748205,1163911210,1016093,20772076550,2029 +70000,100000.0,2556240,189231958784,192107,8759936980,261458,3501575546,418974,16007677742,322025,5913106508,1410766,828871948,432776,11029644085,2029 +100000,150000.0,1302079,135534651949,111055,7844769968,115728,1606086422,184890,9628346000,210088,4457458557,697105,675222400,339747,13877748010,2029 +150000,200000.0,411829,62591278967,46390,5045983197,29768,456378184,47987,3310145060,80644,1880435052,184576,334174839,131779,5235480980,2029 +200000,300000.0,290882,61022507921,41945,6712804838,19006,306795032,30561,2505420739,63136,1736150017,148322,361530373,111198,5825638124,2029 +300000,500000.0,139412,44796677014,24130,6542574681,9380,147188564,14720,1679726822,34827,1232718169,83135,328114313,66492,5280872233,2029 +500000,1000000.0,71701,36125335032,46668,8220936133,4570,39739899,37367,1066748998,48896,788938135,62034,302775541,59451,5566145909,2029 +1000000,inf,37651,51986001088,37114,31182866351,2578,3871198,31184,554228838,36911,700688408,37229,764753903,37322,15528559086,2029 +12570,inf,29617638,1387782118972,4023346,147746272792,8681687,113814160359,10680582,164442577782,2715512,39967408567,17883628,8608957772,4656995,96407643393,2029 diff --git a/policyengine_uk_data/tests/test_spi_build.py b/policyengine_uk_data/tests/test_spi_build.py index 566e6005d..8a4e6b4e5 100644 --- a/policyengine_uk_data/tests/test_spi_build.py +++ b/policyengine_uk_data/tests/test_spi_build.py @@ -217,6 +217,7 @@ def test_income_projection_uses_current_spi_release(): assert incomes_projection.SPI_DATASET.endswith(SPI_H5_FILENAME) assert incomes_projection.SPI_FISCAL_YEAR == SPI_FISCAL_YEAR + assert "savings_interest_income" in incomes_projection.ALL_INCOME_VARIABLES def test_income_projection_builds_current_spi_dataset_when_missing( @@ -248,6 +249,7 @@ def fake_create_spi(path, fiscal_year): monkeypatch.setattr(incomes_projection, "SPI_H5_FILENAME", "spi_2022_23.h5") monkeypatch.setattr(incomes_projection, "SPI_FISCAL_YEAR", 2022) monkeypatch.setattr(incomes_projection, "create_spi", fake_create_spi) + monkeypatch.setattr(incomes_projection, "_read_spi_dataset_year", lambda path: 2022) dataset_path = incomes_projection.ensure_spi_dataset() @@ -259,6 +261,77 @@ def fake_create_spi(path, fiscal_year): } +def test_income_projection_rebuilds_stale_spi_dataset_year( + tmp_path, + monkeypatch, +): + from policyengine_uk_data.utils import incomes_projection + + tab_dir = tmp_path / "spi_2022_23" + tab_dir.mkdir() + (tab_dir / "put2223uk.tab").write_text("fake tab") + dataset_path = tmp_path / "spi_2022_23.h5" + dataset_path.write_text("stale h5") + + read_years = iter([2026, 2022]) + calls = {} + + class FakeDataset: + def save(self, path): + calls["saved_path"] = path + path.write_text("rebuilt h5") + + monkeypatch.setattr(incomes_projection, "STORAGE_FOLDER", tmp_path) + monkeypatch.setattr(incomes_projection, "SPI_RELEASE_NAME", "spi_2022_23") + monkeypatch.setattr(incomes_projection, "SPI_TAB_FILENAME", "put2223uk.tab") + monkeypatch.setattr(incomes_projection, "SPI_H5_FILENAME", "spi_2022_23.h5") + monkeypatch.setattr(incomes_projection, "SPI_FISCAL_YEAR", 2022) + monkeypatch.setattr( + incomes_projection, + "_read_spi_dataset_year", + lambda path: next(read_years), + ) + monkeypatch.setattr( + incomes_projection, + "create_spi", + lambda path, fiscal_year: FakeDataset(), + ) + + assert incomes_projection.ensure_spi_dataset() == str(dataset_path) + assert calls == {"saved_path": dataset_path} + assert dataset_path.read_text() == "rebuilt h5" + + +def test_income_projection_loads_local_h5_dataset(monkeypatch): + from policyengine_uk_data.utils import incomes_projection + + calls = {} + + class FakeDataset: + def __init__(self, path): + calls["path"] = path + self.household = pd.DataFrame( + {"region": ["UNKNOWN", "LONDON", "SOUTH_EAST"]} + ) + + monkeypatch.setattr( + incomes_projection, + "ensure_spi_dataset", + lambda: "/tmp/spi_2022_23.h5", + ) + monkeypatch.setattr(incomes_projection, "UKSingleYearDataset", FakeDataset) + + dataset = incomes_projection.load_spi_dataset() + + assert isinstance(dataset, FakeDataset) + assert calls == {"path": "/tmp/spi_2022_23.h5"} + assert dataset.household["region"].tolist() == [ + "SOUTH_EAST", + "LONDON", + "SOUTH_EAST", + ] + + def test_income_model_cache_rejects_stale_spi_release(tmp_path, monkeypatch): from policyengine_uk_data.datasets.imputations import income as income_module diff --git a/policyengine_uk_data/utils/incomes_projection.py b/policyengine_uk_data/utils/incomes_projection.py index 1f9dbd5bf..f3ebfb188 100644 --- a/policyengine_uk_data/utils/incomes_projection.py +++ b/policyengine_uk_data/utils/incomes_projection.py @@ -4,6 +4,7 @@ from policyengine_uk_data.utils import uprate_values import warnings from policyengine_uk import Microsimulation +from policyengine_uk.data import UKSingleYearDataset from microcalibrate import Calibration from policyengine_uk_data.datasets.spi import ( SPI_FISCAL_YEAR, @@ -18,10 +19,18 @@ SPI_DATASET = str(STORAGE_FOLDER / SPI_H5_FILENAME) +def _read_spi_dataset_year(dataset_path) -> int: + with pd.HDFStore(dataset_path) as store: + return int(store["time_period"].iloc[0]) + + def ensure_spi_dataset() -> str: """Create the SPI H5 projection input from the current TAB release if needed.""" dataset_path = STORAGE_FOLDER / SPI_H5_FILENAME - if dataset_path.exists(): + if ( + dataset_path.exists() + and _read_spi_dataset_year(dataset_path) == SPI_FISCAL_YEAR + ): return str(dataset_path) tab_path = STORAGE_FOLDER / SPI_RELEASE_NAME / SPI_TAB_FILENAME @@ -32,9 +41,23 @@ def ensure_spi_dataset() -> str: ) create_spi(tab_path, SPI_FISCAL_YEAR).save(dataset_path) + dataset_year = _read_spi_dataset_year(dataset_path) + if dataset_year != SPI_FISCAL_YEAR: + raise ValueError( + f"Built SPI dataset {dataset_path} for {dataset_year}, " + f"expected {SPI_FISCAL_YEAR}." + ) return str(dataset_path) +def load_spi_dataset() -> UKSingleYearDataset: + dataset = UKSingleYearDataset(ensure_spi_dataset()) + dataset.household["region"] = dataset.household["region"].replace( + {"UNKNOWN": "SOUTH_EAST"} + ) + return dataset + + tax_benefit = pd.read_csv(STORAGE_FOLDER / "tax_benefit.csv") tax_benefit["name"] = tax_benefit["name"].apply(lambda x: f"obr/{x}") demographics = pd.read_csv(STORAGE_FOLDER / "demographics.csv") @@ -62,6 +85,7 @@ def ensure_spi_dataset() -> str: "state_pension", "private_pension_income", "property_income", + "savings_interest_income", "dividend_income", ] @@ -172,10 +196,11 @@ def get_loss_results(dataset, time_period, reform=None): def create_income_projections(): - spi_dataset = ensure_spi_dataset() - loss_matrix, targets_array = create_target_matrix(spi_dataset, SPI_FISCAL_YEAR) + loss_matrix, targets_array = create_target_matrix( + load_spi_dataset(), SPI_FISCAL_YEAR + ) - sim = Microsimulation(dataset=spi_dataset) + sim = Microsimulation(dataset=load_spi_dataset()) household_weights = sim.calculate("household_weight", SPI_FISCAL_YEAR).values calibration = Calibration( @@ -188,7 +213,7 @@ def create_income_projections(): calibration.calibrate() reweighted_weights = calibration.weights - sim = Microsimulation(dataset=spi_dataset) + sim = Microsimulation(dataset=load_spi_dataset()) sim.set_input("household_weight", SPI_FISCAL_YEAR, reweighted_weights) incomes = pd.read_csv(STORAGE_FOLDER / "incomes.csv") From 19da1aad8a12d7aa336b432b02a8b4dfe230c756 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Sat, 23 May 2026 22:49:26 -0400 Subject: [PATCH 7/9] Read SPI projection metadata without write access --- policyengine_uk_data/tests/test_spi_build.py | 30 +++++++++++++++++++ .../utils/incomes_projection.py | 2 +- 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/policyengine_uk_data/tests/test_spi_build.py b/policyengine_uk_data/tests/test_spi_build.py index 8a4e6b4e5..096e533fd 100644 --- a/policyengine_uk_data/tests/test_spi_build.py +++ b/policyengine_uk_data/tests/test_spi_build.py @@ -220,6 +220,36 @@ def test_income_projection_uses_current_spi_release(): assert "savings_interest_income" in incomes_projection.ALL_INCOME_VARIABLES +def test_income_projection_reads_spi_dataset_year_read_only(monkeypatch): + from policyengine_uk_data.utils import incomes_projection + + calls = {} + + class FakeStore: + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + return None + + def __getitem__(self, key): + assert key == "time_period" + return pd.Series([2022]) + + def fake_hdf_store(path, mode=None): + calls["path"] = path + calls["mode"] = mode + return FakeStore() + + monkeypatch.setattr(incomes_projection.pd, "HDFStore", fake_hdf_store) + + assert incomes_projection._read_spi_dataset_year("/readonly/spi_2022_23.h5") == 2022 + assert calls == { + "path": "/readonly/spi_2022_23.h5", + "mode": "r", + } + + def test_income_projection_builds_current_spi_dataset_when_missing( tmp_path, monkeypatch, diff --git a/policyengine_uk_data/utils/incomes_projection.py b/policyengine_uk_data/utils/incomes_projection.py index f3ebfb188..57213f47b 100644 --- a/policyengine_uk_data/utils/incomes_projection.py +++ b/policyengine_uk_data/utils/incomes_projection.py @@ -20,7 +20,7 @@ def _read_spi_dataset_year(dataset_path) -> int: - with pd.HDFStore(dataset_path) as store: + with pd.HDFStore(dataset_path, mode="r") as store: return int(store["time_period"].iloc[0]) From 7f4d8d89b235febfaa80473b31e3d749ba36ff67 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Sat, 23 May 2026 23:22:53 -0400 Subject: [PATCH 8/9] Reduce SPI income model sample in test builds --- .../datasets/imputations/income.py | 11 ++++++++++- policyengine_uk_data/tests/test_spi_build.py | 16 ++++++++++++++++ 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/policyengine_uk_data/datasets/imputations/income.py b/policyengine_uk_data/datasets/imputations/income.py index a3c829f16..b1b543f39 100644 --- a/policyengine_uk_data/datasets/imputations/income.py +++ b/policyengine_uk_data/datasets/imputations/income.py @@ -8,6 +8,7 @@ import pandas as pd import numpy as np +import os from policyengine_uk_data.storage import STORAGE_FOLDER from policyengine_uk.data import UKSingleYearDataset from policyengine_uk import Microsimulation @@ -131,6 +132,14 @@ def generate_spi_table( "imputations": tuple(IMPUTATIONS), } INCOME_MODEL_PATH = STORAGE_FOLDER / f"income_{SPI_RELEASE_NAME}.pkl" +INCOME_MODEL_SAMPLE_SIZE = 100_000 +TESTING_INCOME_MODEL_SAMPLE_SIZE = 10_000 + + +def get_income_model_sample_size() -> int: + if os.environ.get("TESTING", "0") == "1": + return TESTING_INCOME_MODEL_SAMPLE_SIZE + return INCOME_MODEL_SAMPLE_SIZE def _income_model_matches_current_release(model) -> bool: @@ -153,7 +162,7 @@ def save_imputation_models(): income = QRF() income.metadata = INCOME_MODEL_METADATA spi = pd.read_csv(SPI_TAB_FOLDER / SPI_TAB_FILENAME, delimiter="\t") - spi = generate_spi_table(spi) + spi = generate_spi_table(spi, sample_size=get_income_model_sample_size()) spi = spi[PREDICTORS + IMPUTATIONS] income.fit(spi[PREDICTORS], spi[IMPUTATIONS]) income.save(INCOME_MODEL_PATH) diff --git a/policyengine_uk_data/tests/test_spi_build.py b/policyengine_uk_data/tests/test_spi_build.py index 096e533fd..1a35fc374 100644 --- a/policyengine_uk_data/tests/test_spi_build.py +++ b/policyengine_uk_data/tests/test_spi_build.py @@ -211,6 +211,22 @@ def test_income_model_cache_is_release_scoped(): assert INCOME_MODEL_PATH.name == f"income_{SPI_RELEASE_NAME}.pkl" +def test_income_model_sample_size_is_reduced_in_testing(monkeypatch): + from policyengine_uk_data.datasets.imputations import income as income_module + + monkeypatch.delenv("TESTING", raising=False) + assert ( + income_module.get_income_model_sample_size() + == income_module.INCOME_MODEL_SAMPLE_SIZE + ) + + monkeypatch.setenv("TESTING", "1") + assert ( + income_module.get_income_model_sample_size() + == income_module.TESTING_INCOME_MODEL_SAMPLE_SIZE + ) + + def test_income_projection_uses_current_spi_release(): from policyengine_uk_data.utils import incomes_projection from policyengine_uk_data.datasets.spi import SPI_FISCAL_YEAR, SPI_H5_FILENAME From 8b542cd197898b7f0e6eac77a3c61936a24bf845 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Sat, 23 May 2026 23:26:48 -0400 Subject: [PATCH 9/9] Track SPI income model sample size in cache metadata --- .../datasets/imputations/income.py | 11 ++++- policyengine_uk_data/tests/test_spi_build.py | 45 ++++++++++++++++--- 2 files changed, 49 insertions(+), 7 deletions(-) diff --git a/policyengine_uk_data/datasets/imputations/income.py b/policyengine_uk_data/datasets/imputations/income.py index b1b543f39..763a632ec 100644 --- a/policyengine_uk_data/datasets/imputations/income.py +++ b/policyengine_uk_data/datasets/imputations/income.py @@ -142,8 +142,15 @@ def get_income_model_sample_size() -> int: return INCOME_MODEL_SAMPLE_SIZE +def get_income_model_metadata() -> dict: + return { + **INCOME_MODEL_METADATA, + "sample_size": get_income_model_sample_size(), + } + + def _income_model_matches_current_release(model) -> bool: - if getattr(model, "metadata", {}) != INCOME_MODEL_METADATA: + if getattr(model, "metadata", {}) != get_income_model_metadata(): return False cached_outputs = set(getattr(model.model, "imputed_variables", [])) @@ -160,7 +167,7 @@ def save_imputation_models(): from policyengine_uk_data.utils import QRF income = QRF() - income.metadata = INCOME_MODEL_METADATA + income.metadata = get_income_model_metadata() spi = pd.read_csv(SPI_TAB_FOLDER / SPI_TAB_FILENAME, delimiter="\t") spi = generate_spi_table(spi, sample_size=get_income_model_sample_size()) spi = spi[PREDICTORS + IMPUTATIONS] diff --git a/policyengine_uk_data/tests/test_spi_build.py b/policyengine_uk_data/tests/test_spi_build.py index 1a35fc374..efb37f810 100644 --- a/policyengine_uk_data/tests/test_spi_build.py +++ b/policyengine_uk_data/tests/test_spi_build.py @@ -219,12 +219,20 @@ def test_income_model_sample_size_is_reduced_in_testing(monkeypatch): income_module.get_income_model_sample_size() == income_module.INCOME_MODEL_SAMPLE_SIZE ) + assert ( + income_module.get_income_model_metadata()["sample_size"] + == income_module.INCOME_MODEL_SAMPLE_SIZE + ) monkeypatch.setenv("TESTING", "1") assert ( income_module.get_income_model_sample_size() == income_module.TESTING_INCOME_MODEL_SAMPLE_SIZE ) + assert ( + income_module.get_income_model_metadata()["sample_size"] + == income_module.TESTING_INCOME_MODEL_SAMPLE_SIZE + ) def test_income_projection_uses_current_spi_release(): @@ -383,7 +391,7 @@ def test_income_model_cache_rejects_stale_spi_release(tmp_path, monkeypatch): cache = tmp_path / "income_spi_2022_23.pkl" stale_metadata = { - **income_module.INCOME_MODEL_METADATA, + **income_module.get_income_model_metadata(), "spi_release_name": "spi_2020_21", "spi_tab_filename": "put2021uk.tab", } @@ -406,10 +414,39 @@ def test_income_model_cache_rejects_stale_spi_release(tmp_path, monkeypatch): assert income_module.create_income_model() is sentinel +def test_income_model_cache_rejects_stale_sample_size(tmp_path, monkeypatch): + from policyengine_uk_data.datasets.imputations import income as income_module + + monkeypatch.delenv("TESTING", raising=False) + cache = tmp_path / "income_spi_2022_23.pkl" + stale_metadata = { + **income_module.get_income_model_metadata(), + "sample_size": income_module.TESTING_INCOME_MODEL_SAMPLE_SIZE, + } + with cache.open("wb") as f: + pickle.dump( + { + "model": SimpleNamespace( + imputed_variables=list(income_module.IMPUTATIONS) + ), + "input_columns": income_module.PREDICTORS, + "metadata": stale_metadata, + }, + f, + ) + + sentinel = object() + monkeypatch.setattr(income_module, "INCOME_MODEL_PATH", cache) + monkeypatch.setattr(income_module, "save_imputation_models", lambda: sentinel) + + assert income_module.create_income_model() is sentinel + + def test_income_model_cache_accepts_current_spi_release(tmp_path, monkeypatch): from policyengine_uk_data.datasets.imputations import income as income_module cache = tmp_path / "income_spi_2022_23.pkl" + current_metadata = income_module.get_income_model_metadata() with cache.open("wb") as f: pickle.dump( { @@ -417,7 +454,7 @@ def test_income_model_cache_accepts_current_spi_release(tmp_path, monkeypatch): imputed_variables=list(income_module.IMPUTATIONS) ), "input_columns": income_module.PREDICTORS, - "metadata": income_module.INCOME_MODEL_METADATA, + "metadata": current_metadata, }, f, ) @@ -429,6 +466,4 @@ def test_income_model_cache_accepts_current_spi_release(tmp_path, monkeypatch): lambda: pytest.fail("current SPI release cache should be reused"), ) - assert income_module.create_income_model().metadata == ( - income_module.INCOME_MODEL_METADATA - ) + assert income_module.create_income_model().metadata == current_metadata