diff --git a/policyengine_uk_data/targets/sources.yaml b/policyengine_uk_data/targets/sources.yaml index 0e8e56293..e6fa75c6c 100644 --- a/policyengine_uk_data/targets/sources.yaml +++ b/policyengine_uk_data/targets/sources.yaml @@ -7,8 +7,8 @@ obr: vintage: "march_2026" hmrc: - spi_collated: "https://assets.publishing.service.gov.uk/media/67cabb37ade26736dbf9ffe5/Collated_Tables_3_1_to_3_17_2223.ods" - spi_geography: "https://assets.publishing.service.gov.uk/media/67cabb7f8c1076c796a45bec/Collated_Tables_3_12_to_3_15a_2223.ods" + spi_collated: "https://assets.publishing.service.gov.uk/media/69f1f12d2fae53a03709682f/Collated_Tables_3_1_to_3_11_2324.ods" + spi_geography: "https://assets.publishing.service.gov.uk/media/69f1f17cc42061e837e3ac3b/Collated_Tables_3_12_to_3_15a_2324.ods" income_tax_liabilities: "https://www.gov.uk/government/statistics/income-tax-liabilities-statistics-tax-year-2022-to-2023-to-tax-year-2025-to-2026" salary_sacrifice_table_6: "https://assets.publishing.service.gov.uk/media/687a294e312ee8a5f0806b6d/Tables_6_1_and_6_2.csv" diff --git a/policyengine_uk_data/targets/sources/hmrc_spi.py b/policyengine_uk_data/targets/sources/hmrc_spi.py index d2c0eeba0..64aeaf3f3 100644 --- a/policyengine_uk_data/targets/sources/hmrc_spi.py +++ b/policyengine_uk_data/targets/sources/hmrc_spi.py @@ -1,7 +1,7 @@ """HMRC Survey of Personal Incomes targets. Downloads and parses the SPI ODS (Tables 3.6 and 3.7) to get income -distributions by total income band and income type for 2022-23. +distributions by total income band and income type for 2023-24. For future year projections, the microsimulation uprates these base year distributions forward using PolicyEngine's uprating factors. @@ -54,8 +54,8 @@ ] _BAND_UPPER = _BAND_LOWER[1:] + [float("inf")] -# SPI year: the ODS is for tax year 2022-23, mapped to calendar 2023 -_SPI_YEAR = 2023 +# SPI year: the ODS is for tax year 2023-24, mapped to calendar 2024 +_SPI_YEAR = 2024 # HMRC Property Rental Income Statistics show ~1.9x more property income # than the SPI (£46.68bn vs £24.5bn for 2020-21), because SPI only covers diff --git a/policyengine_uk_data/targets/sources/ons_demographics.py b/policyengine_uk_data/targets/sources/ons_demographics.py index dba77671d..ffac427c6 100644 --- a/policyengine_uk_data/targets/sources/ons_demographics.py +++ b/policyengine_uk_data/targets/sources/ons_demographics.py @@ -1,6 +1,6 @@ """ONS population projections and demographic targets. -Downloads the ONS 2022-based principal population projection for the +Downloads the ONS 2024-based principal population projection for the UK to extract total population and gender × age band targets. For regional age breakdowns (12 regions × 9 age bands), reads the @@ -36,7 +36,7 @@ _UK_ZIP_URL = ( "https://www.ons.gov.uk/file?uri=/peoplepopulationandcommunity/" "populationandmigration/populationprojections/datasets/" - "z1zippedpopulationprojectionsdatafilesuk/2022based/uk.zip" + "z1zippedpopulationprojectionsdatafilesuk/2024based/uk.zip" ) _REF_REGION = ( @@ -81,7 +81,8 @@ def _download_uk_projection() -> pd.DataFrame: r = requests.get(_UK_ZIP_URL, headers=HEADERS, allow_redirects=True, timeout=120) r.raise_for_status() z = zipfile.ZipFile(io.BytesIO(r.content)) - with z.open("uk/uk_ppp_machine_readable.xlsx") as f: + projection_member = _find_projection_member(z.namelist()) + with z.open(projection_member) as f: df = pd.read_excel( io.BytesIO(f.read()), sheet_name="Population", @@ -90,22 +91,42 @@ def _download_uk_projection() -> pd.DataFrame: return df +def _find_projection_member(names: list[str]) -> str: + """Find the UK principal projection workbook inside the ONS zip.""" + for name in names: + if name.endswith("uk_ppp_machine_readable.xlsx"): + return name + raise RuntimeError( + "ONS UK projection zip did not contain uk_ppp_machine_readable.xlsx" + ) + + def _aggregate_ages( df: pd.DataFrame, sex: str, low: int, high: int, years: list[int] ) -> dict[int, float]: """Sum population for a sex and age range across years.""" sex_filter = "Females" if sex == "female" else "Males" - mask = (df["Sex"] == sex_filter) & ( - df["Age"].apply(lambda a: isinstance(a, int) and low <= a <= high) - ) + ages = pd.to_numeric(df["Age"], errors="coerce") + mask = (df["Sex"] == sex_filter) & ages.between(low, high) subset = df[mask] result = {} for y in years: - if y in subset.columns: - result[y] = float(subset[y].sum()) + column = _year_column(subset, y) + if column is not None: + result[y] = float(subset[column].sum()) return result +def _year_column(df: pd.DataFrame, year: int) -> int | str | None: + """Return the workbook column for a year across ONS vintages.""" + if year in df.columns: + return year + string_year = str(year) + if string_year in df.columns: + return string_year + return None + + def _parse_uk_totals(df: pd.DataFrame) -> list[Target]: """Extract UK total population and gender × age bands.""" targets = [] @@ -113,8 +134,9 @@ def _parse_uk_totals(df: pd.DataFrame) -> list[Target]: # UK total uk_pop = {} for y in _YEARS: - if y in df.columns: - uk_pop[y] = float(df[y].sum()) + column = _year_column(df, y) + if column is not None: + uk_pop[y] = float(df[column].sum()) if uk_pop: targets.append( Target( diff --git a/policyengine_uk_data/targets/sources/ons_households.py b/policyengine_uk_data/targets/sources/ons_households.py index 9fd4b49f2..5b08bd3c2 100644 --- a/policyengine_uk_data/targets/sources/ons_households.py +++ b/policyengine_uk_data/targets/sources/ons_households.py @@ -22,7 +22,7 @@ "https://www.ons.gov.uk/file?uri=/peoplepopulationandcommunity/" "birthsdeathsandmarriages/families/datasets/" "familiesandhouseholdsfamiliesandhouseholds/" - "current/familiesandhouseholdsuk2024.xlsx" + "current/familiesandhouseholdsuk2025.xlsx" ) _REF = ( "https://www.ons.gov.uk/peoplepopulationandcommunity/" diff --git a/policyengine_uk_data/tests/test_ons_demographics_targets.py b/policyengine_uk_data/tests/test_ons_demographics_targets.py new file mode 100644 index 000000000..433bf3e25 --- /dev/null +++ b/policyengine_uk_data/tests/test_ons_demographics_targets.py @@ -0,0 +1,24 @@ +import pandas as pd +import pytest + +from policyengine_uk_data.targets.sources.ons_demographics import ( + _aggregate_ages, + _find_projection_member, +) + + +def test_aggregate_ages_accepts_string_age_values(): + df = pd.DataFrame( + { + "Sex": ["Females", "Females", "Females", "Males"], + "Age": ["14", "15", "90", "15"], + 2025: [1, 2, 4, 8], + } + ) + + assert _aggregate_ages(df, "female", 15, 90, [2025]) == {2025: 6.0} + + +def test_find_projection_member_fails_loudly(): + with pytest.raises(RuntimeError, match="uk_ppp_machine_readable"): + _find_projection_member(["uk/readme.txt"]) diff --git a/policyengine_uk_data/tests/test_population.py b/policyengine_uk_data/tests/test_population.py index 1714887ca..67fd71676 100644 --- a/policyengine_uk_data/tests/test_population.py +++ b/policyengine_uk_data/tests/test_population.py @@ -1,6 +1,6 @@ def test_population(baseline): population = baseline.calculate("people", 2025).sum() / 1e6 - POPULATION_TARGET = 69.5 # ONS 2022-based projection for 2025, millions: https://www.ons.gov.uk/peoplepopulationandcommunity/populationandmigration/populationprojections/bulletins/nationalpopulationprojections/2022based + POPULATION_TARGET = 69.5 # ONS 2024-based projection for 2025, millions: https://www.ons.gov.uk/peoplepopulationandcommunity/populationandmigration/populationprojections/bulletins/nationalpopulationprojections/2024based # Tightened from 7% to 4% after data-pipeline improvements in April 2026 # (stage-2 QRF imputation #362, TFC target refresh #363, reported-anchor # takeup #359) pulled the weighted UK population down from ~74M (+6.5%) diff --git a/policyengine_uk_data/tests/test_population_fidelity.py b/policyengine_uk_data/tests/test_population_fidelity.py index 272212516..512715152 100644 --- a/policyengine_uk_data/tests/test_population_fidelity.py +++ b/policyengine_uk_data/tests/test_population_fidelity.py @@ -16,7 +16,7 @@ import numpy as np -POPULATION_TARGET = 69.5 # ONS 2022-based projection for 2025, millions +POPULATION_TARGET = 69.5 # ONS 2024-based projection for 2025, millions TOLERANCE = 0.04 # 4% — covers ~1.6%-3.3% stochastic calibration variance diff --git a/policyengine_uk_data/tests/test_property_income_targets.py b/policyengine_uk_data/tests/test_property_income_targets.py index 0b302c64b..660b796a0 100644 --- a/policyengine_uk_data/tests/test_property_income_targets.py +++ b/policyengine_uk_data/tests/test_property_income_targets.py @@ -10,16 +10,20 @@ def test_property_income_targets_scaled(): """Property income targets should be ~1.9x the raw SPI values. - Raw SPI 2022-23 total is ~£27bn. After scaling, targets for the - base year should be ~£52bn (matching HMRC rental income stats). + Raw SPI 2023-24 total is scaled up to better match HMRC rental + income statistics, which cover more landlords than SPI. """ - targets = get_all_targets(year=2023) + base_year = 2024 + targets = get_all_targets(year=base_year) total = sum( - t.values[2023] + t.values[base_year] for t in targets - if "property_income" in t.name and "count" not in t.name and 2023 in t.values + if "property_income" in t.name + and "count" not in t.name + and base_year in t.values ) - # Raw SPI gives ~£27bn, scaled by 1.9x should give ~£52bn + # Raw SPI gives roughly half of all landlord income; scaling should + # leave the current base-year target in this broad administrative range. assert total > 45e9, ( f"Property income target total £{total / 1e9:.1f}bn is below £45bn. " "Scaling factor may not be applied."