From 53c75ebe6c2b20e2de766ddb338166dab9fe3486 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Sun, 24 May 2026 00:22:33 -0400 Subject: [PATCH 1/4] Update private UK survey prerequisites --- .../datasets/imputations/consumption.py | 151 +++++++++++++----- .../datasets/imputations/services/etb.py | 42 +++-- .../datasets/imputations/vat.py | 55 +++++-- .../datasets/imputations/wealth.py | 105 +++++++----- .../datasets/private_releases.py | 79 +++++++++ .../storage/download_private_prerequisites.py | 11 +- .../tests/test_frs_prerequisites.py | 26 +++ .../tests/test_private_releases.py | 72 +++++++++ .../tests/test_road_fuel_volume_uprating.py | 17 +- .../tests/test_student_loan_balance.py | 47 ++++-- .../tests/test_vat_parameters.py | 7 + 11 files changed, 490 insertions(+), 122 deletions(-) create mode 100644 policyengine_uk_data/datasets/private_releases.py create mode 100644 policyengine_uk_data/tests/test_private_releases.py diff --git a/policyengine_uk_data/datasets/imputations/consumption.py b/policyengine_uk_data/datasets/imputations/consumption.py index 9723631c2..cb04ffb0c 100644 --- a/policyengine_uk_data/datasets/imputations/consumption.py +++ b/policyengine_uk_data/datasets/imputations/consumption.py @@ -23,12 +23,16 @@ import pandas as pd import numpy as np from policyengine_uk_data.datasets.frs_release import CURRENT_FRS_RELEASE +from policyengine_uk_data.datasets.private_releases import ( + CURRENT_LCFS_RELEASE, + CURRENT_WAS_RELEASE, +) from policyengine_uk_data.storage import STORAGE_FOLDER from policyengine_uk.data import UKSingleYearDataset from policyengine_uk import Microsimulation from policyengine_uk_data.datasets.frs import WEEKS_IN_YEAR -LCFS_TAB_FOLDER = STORAGE_FOLDER / "lcfs_2021_22" +LCFS_TAB_FOLDER = STORAGE_FOLDER / CURRENT_LCFS_RELEASE.name # Default seed for the stochastic ICE-vehicle flag drawn from # `NTS_2024_ICE_VEHICLE_SHARE`. Kept at 42 for backward compatibility with @@ -39,20 +43,31 @@ # EV/ICE vehicle mix from NTS 2024 NTS_2024_ICE_VEHICLE_SHARE = 0.90 -# DESNZ weekly road-fuel price statistics, "Data" sheet, fiscal-year average -# UK pump prices over 2021-04-01 to 2022-03-31. Data source: +# DESNZ weekly road-fuel price statistics, fiscal-year average UK pump prices. +# 2023 prices cover 2023-04-01 to 2024-03-31 for the current LCFS release. +# Data source: # https://www.data.gov.uk/dataset/21db6396-3daf-4d90-8b3f-054995256018/petrol-and-diesel-prices # LCFS records nominal fuel spending, while PolicyEngine derives litres via # ``spending / model pump price``. LCFS_FUEL_PRICE_GBP_PER_LITRE = { - "petrol_spending": {2021: 1.3890790089424998}, - "diesel_spending": {2021: 1.4291180616502566}, + "petrol_spending": { + 2021: 1.3890790089424998, + 2023: 1.4615903846153844, + }, + "diesel_spending": { + 2021: 1.4291180616502566, + 2023: 1.5348538461538461, + }, } FUEL_PRICE_PARAMETER_NAME = { "petrol_spending": "petrol", "diesel_spending": "diesel", } -CONSUMPTION_MODEL_FILENAME = "consumption_fuel_litre_proxy_2026_05.pkl" +CONSUMPTION_MODEL_FILENAME = ( + f"consumption_{CURRENT_LCFS_RELEASE.name}_{CURRENT_WAS_RELEASE.name}" + "_fuel_litre_proxy_2026_05.pkl" +) +HAS_FUEL_MODEL_FILENAME = f"has_fuel_{CURRENT_WAS_RELEASE.name}.pkl" REGIONS = { 1: "NORTH_EAST", @@ -84,7 +99,7 @@ } # LCFS A121 → FRS accommodation_type mapping -# LCFS coding inferred from LCFS 2021/22 user guide: +# LCFS coding inferred from the LCFS user guide: # 1=detached house, 2=semi-detached, 3=terraced, 4=flat (purpose-built), # 5=flat/other (converted), 6=caravan/mobile, 7=bungalow/other house, 8=other LCFS_ACCOMM_MAP = { @@ -164,6 +179,60 @@ "gas_consumption", ] +HAS_FUEL_PREDICTOR_VARIABLES = [ + "household_net_income", + "num_adults", + "num_children", + "private_pension_income", + "employment_income", + "self_employment_income", + "region", +] + + +def get_has_fuel_model_path(): + return STORAGE_FOLDER / HAS_FUEL_MODEL_FILENAME + + +def get_has_fuel_model_metadata() -> dict: + return { + "was_release_name": CURRENT_WAS_RELEASE.name, + "was_household_tab_filename": CURRENT_WAS_RELEASE.household_tab_filename, + "predictor_variables": tuple(HAS_FUEL_PREDICTOR_VARIABLES), + "impute_variables": ("has_fuel_consumption",), + "ice_vehicle_share": NTS_2024_ICE_VEHICLE_SHARE, + "seed": _HAS_FUEL_SEED, + } + + +def get_consumption_model_path(): + return STORAGE_FOLDER / CONSUMPTION_MODEL_FILENAME + + +def get_consumption_model_metadata() -> dict: + return { + "lcfs_release_name": CURRENT_LCFS_RELEASE.name, + "lcfs_household_tab_filename": CURRENT_LCFS_RELEASE.household_tab_filename, + "lcfs_person_tab_filename": CURRENT_LCFS_RELEASE.person_tab_filename, + "lcfs_fuel_price_year": CURRENT_LCFS_RELEASE.fuel_price_year, + "was_release_name": CURRENT_WAS_RELEASE.name, + "was_household_tab_filename": CURRENT_WAS_RELEASE.household_tab_filename, + "frs_base_year": CURRENT_FRS_RELEASE.base_year, + "predictor_variables": tuple(PREDICTOR_VARIABLES), + "impute_variables": tuple(IMPUTATIONS), + } + + +def _qrf_model_matches_current_metadata( + model, metadata: dict, outputs: list[str] +) -> bool: + if getattr(model, "metadata", {}) != metadata: + return False + + trained_outputs = getattr(model.model, "imputed_variables", None) + return list(trained_outputs) == outputs + + # ── NEED 2023 calibration targets ───────────────────────────────────────────── # Source: NEED 2023 headline tables (published 2025), England & Wales, ~18M dwellings. # Tables 11b/12b: mean gas/electricity kWh by income; 9b/10b by tenure; @@ -420,21 +489,27 @@ def create_has_fuel_model(): from policyengine_uk_data.utils.qrf import QRF from policyengine_uk_data.datasets.imputations.wealth import ( WAS_TAB_FOLDER, - REGIONS, + generate_was_table, ) - model_path = STORAGE_FOLDER / "has_fuel_model.pkl" + model_path = get_has_fuel_model_path() if model_path.exists(): - return QRF(file_path=model_path) + cached = QRF(file_path=model_path) + if _qrf_model_matches_current_metadata( + cached, + get_has_fuel_model_metadata(), + ["has_fuel_consumption"], + ): + return cached was = pd.read_csv( - WAS_TAB_FOLDER / "was_round_7_hhold_eul_march_2022.tab", + WAS_TAB_FOLDER / CURRENT_WAS_RELEASE.household_tab_filename, sep="\t", low_memory=False, ) - was.columns = [c.lower() for c in was.columns] + was = generate_was_table(was) - num_vehicles = was["vcarnr7"].fillna(0).clip(lower=0) + num_vehicles = was["num_vehicles"].fillna(0).clip(lower=0) has_vehicle = num_vehicles > 0 # Use a local RNG so we don't mutate the global np.random state (which # would silently change any unrelated consumer of np.random that runs @@ -444,30 +519,16 @@ def create_has_fuel_model(): has_vehicle & (rng.random(len(was)) < NTS_2024_ICE_VEHICLE_SHARE) ).astype(float) - was_df = pd.DataFrame( - { - "household_net_income": was["dvtotinc_bhcr7"], - "num_adults": was["numadultr7"], - "num_children": was["numch18r7"], - "private_pension_income": was["dvgippenr7_aggr"], - "employment_income": was["dvgiempr7_aggr"], - "self_employment_income": was["dvgiser7_aggr"], - "region": was["gorr7"].map(REGIONS), - "has_fuel_consumption": has_fuel, - } - ).dropna() + was_df = was[HAS_FUEL_PREDICTOR_VARIABLES].copy() + was_df["has_fuel_consumption"] = has_fuel + was_df = was_df.dropna() - predictors = [ - "household_net_income", - "num_adults", - "num_children", - "private_pension_income", - "employment_income", - "self_employment_income", - "region", - ] model = QRF() - model.fit(was_df[predictors], was_df[["has_fuel_consumption"]]) + model.metadata = get_has_fuel_model_metadata() + model.fit( + was_df[HAS_FUEL_PREDICTOR_VARIABLES], + was_df[["has_fuel_consumption"]], + ) model.save(model_path) return model @@ -544,7 +605,7 @@ def generate_lcfs_table(lcfs_person: pd.DataFrame, lcfs_household: pd.DataFrame) def uprate_lcfs_table(household: pd.DataFrame, time_period: str) -> pd.DataFrame: from policyengine_uk.system import system - start_period = 2021 + start_period = CURRENT_LCFS_RELEASE.fuel_price_year target_year = int(str(time_period)[:4]) for variable in FUEL_PRICE_PARAMETER_NAME: household[variable] *= fuel_spending_litre_proxy_uprating( @@ -688,27 +749,35 @@ def save_imputation_models(): from policyengine_uk_data.utils.qrf import QRF consumption = QRF() + consumption.metadata = get_consumption_model_metadata() lcfs_household = pd.read_csv( - LCFS_TAB_FOLDER / "lcfs_2021_dvhh_ukanon.tab", + LCFS_TAB_FOLDER / CURRENT_LCFS_RELEASE.household_tab_filename, delimiter="\t", low_memory=False, ) lcfs_person = pd.read_csv( - LCFS_TAB_FOLDER / "lcfs_2021_dvper_ukanon202122.tab", delimiter="\t" + LCFS_TAB_FOLDER / CURRENT_LCFS_RELEASE.person_tab_filename, + delimiter="\t", ) household = generate_lcfs_table(lcfs_person, lcfs_household) household = uprate_lcfs_table(household, str(CURRENT_FRS_RELEASE.base_year)) consumption.fit(household[PREDICTOR_VARIABLES], household[IMPUTATIONS]) - consumption.save(STORAGE_FOLDER / CONSUMPTION_MODEL_FILENAME) + consumption.save(get_consumption_model_path()) return consumption def create_consumption_model(overwrite_existing: bool = False): from policyengine_uk_data.utils.qrf import QRF - model_path = STORAGE_FOLDER / CONSUMPTION_MODEL_FILENAME + model_path = get_consumption_model_path() if model_path.exists() and not overwrite_existing: - return QRF(file_path=model_path) + cached = QRF(file_path=model_path) + if _qrf_model_matches_current_metadata( + cached, + get_consumption_model_metadata(), + IMPUTATIONS, + ): + return cached return save_imputation_models() diff --git a/policyengine_uk_data/datasets/imputations/services/etb.py b/policyengine_uk_data/datasets/imputations/services/etb.py index 31678e42c..014fbe045 100644 --- a/policyengine_uk_data/datasets/imputations/services/etb.py +++ b/policyengine_uk_data/datasets/imputations/services/etb.py @@ -7,17 +7,16 @@ import pandas as pd import numpy as np -from pathlib import Path -import logging from policyengine_uk import Microsimulation -from huggingface_hub import hf_hub_download -import os +from policyengine_uk_data.datasets.private_releases import CURRENT_ETB_RELEASE from policyengine_uk_data.storage import STORAGE_FOLDER from policyengine_uk_data.utils.qrf import QRF from policyengine_uk.data import UKSingleYearDataset # Constants WEEKS_IN_YEAR = 52 +ETB_TAB_FOLDER = STORAGE_FOLDER / CURRENT_ETB_RELEASE.name +PUBLIC_SERVICES_MODEL_FILENAME = f"public_services_{CURRENT_ETB_RELEASE.name}.pkl" # Variables used to predict public service receipt PREDICTORS = [ @@ -40,18 +39,41 @@ ] -def create_public_services_model(overwrite_existing: bool = False) -> None: +def get_public_services_model_path(): + return STORAGE_FOLDER / PUBLIC_SERVICES_MODEL_FILENAME + + +def get_public_services_model_metadata() -> dict: + return { + "etb_release_name": CURRENT_ETB_RELEASE.name, + "etb_household_tab_filename": CURRENT_ETB_RELEASE.household_tab_filename, + "predictor_variables": tuple(PREDICTORS), + "output_variables": tuple(OUTPUTS), + } + + +def _public_services_model_matches_current_release(model: QRF) -> bool: + if getattr(model, "metadata", {}) != get_public_services_model_metadata(): + return False + + trained_outputs = getattr(model.model, "imputed_variables", None) + return list(trained_outputs) == OUTPUTS + + +def create_public_services_model(overwrite_existing: bool = False) -> QRF: """ Create and save a model for imputing public service receipt values. Args: overwrite_existing: Whether to overwrite an existing model file. """ - # Check if model already exists and we're not overwriting - if (STORAGE_FOLDER / "public_services.pkl").exists() and not overwrite_existing: - return + model_path = get_public_services_model_path() + if model_path.exists() and not overwrite_existing: + cached = QRF(file_path=model_path) + if _public_services_model_matches_current_release(cached): + return cached - etb_path = STORAGE_FOLDER / "etb_1977_21" / "householdv2_1977-2021.tab" + etb_path = ETB_TAB_FOLDER / CURRENT_ETB_RELEASE.household_tab_filename # Load Effects of Taxes and Benefits (ETB) dataset etb = pd.read_csv(etb_path, delimiter="\t") @@ -102,7 +124,9 @@ def create_public_services_model(overwrite_existing: bool = False) -> None: # Train model model = QRF() + model.metadata = get_public_services_model_metadata() model.fit(X=train[PREDICTORS], y=train[OUTPUTS]) + model.save(model_path) return model diff --git a/policyengine_uk_data/datasets/imputations/vat.py b/policyengine_uk_data/datasets/imputations/vat.py index 5b30b4ed8..7d3ee9564 100644 --- a/policyengine_uk_data/datasets/imputations/vat.py +++ b/policyengine_uk_data/datasets/imputations/vat.py @@ -13,20 +13,19 @@ """ import pandas as pd -from pathlib import Path -import numpy as np +from policyengine_uk_data.datasets.private_releases import CURRENT_ETB_RELEASE from policyengine_uk_data.storage import STORAGE_FOLDER from policyengine_uk.data import UKSingleYearDataset from policyengine_uk import Microsimulation -ETB_TAB_FOLDER = STORAGE_FOLDER / "etb_1977_21" +ETB_TAB_FOLDER = STORAGE_FOLDER / CURRENT_ETB_RELEASE.name +VAT_MODEL_FILENAME = ( + f"vat_{CURRENT_ETB_RELEASE.name}_{CURRENT_ETB_RELEASE.default_training_year}.pkl" +) -# Default ETB vintage used when training the imputation model. Kept at 2020 -# for backward compatibility with the checked-in vat.pkl fingerprint, but -# exposed as a module constant rather than an inline magic number so later -# updates require only a one-line change (not scattered `etb.year == 2020` -# checks). -DEFAULT_ETB_YEAR = 2020 +# Default ETB vintage used when training the imputation model. The ETB 1977-2024 +# file uses ``year == 2023`` for financial year ending 2024. +DEFAULT_ETB_YEAR = CURRENT_ETB_RELEASE.default_training_year # Fallback VAT parameters used when `policyengine_uk` is unavailable (e.g. # unit-test environments). Values match the 2020-21 UK statutory position. @@ -40,12 +39,38 @@ VAT_RATE_BY_YEAR: dict[int, tuple[float, float]] = { 2020: (0.2, 0.03), 2021: (0.2, 0.03), + 2022: (0.2, 0.03), + 2023: (0.2, 0.03), } PREDICTORS = ["is_adult", "is_child", "is_SP_age", "household_net_income"] IMPUTATIONS = ["full_rate_vat_expenditure_rate"] +def get_vat_model_path(year: int = DEFAULT_ETB_YEAR): + if year == DEFAULT_ETB_YEAR: + return STORAGE_FOLDER / VAT_MODEL_FILENAME + return STORAGE_FOLDER / f"vat_{CURRENT_ETB_RELEASE.name}_{year}.pkl" + + +def get_vat_model_metadata(year: int = DEFAULT_ETB_YEAR) -> dict: + return { + "etb_release_name": CURRENT_ETB_RELEASE.name, + "etb_household_tab_filename": CURRENT_ETB_RELEASE.household_tab_filename, + "training_year": year, + "predictor_variables": tuple(PREDICTORS), + "impute_variables": tuple(IMPUTATIONS), + } + + +def _vat_model_matches_current_release(model, year: int = DEFAULT_ETB_YEAR) -> bool: + if getattr(model, "metadata", {}) != get_vat_model_metadata(year): + return False + + trained_outputs = getattr(model.model, "imputed_variables", None) + return list(trained_outputs) == IMPUTATIONS + + def _get_vat_parameters(year: int) -> tuple[float, float]: """Return ``(standard_rate, reduced_rate_share)`` for the given calendar year. @@ -106,15 +131,16 @@ def save_imputation_models(year: int = DEFAULT_ETB_YEAR): from policyengine_uk_data.utils.qrf import QRF vat = QRF() + vat.metadata = get_vat_model_metadata(year) etb = pd.read_csv( - ETB_TAB_FOLDER / "householdv2_1977-2021.tab", + ETB_TAB_FOLDER / CURRENT_ETB_RELEASE.household_tab_filename, delimiter="\t", low_memory=False, ) etb = generate_etb_table(etb, year=year) etb = etb[PREDICTORS + IMPUTATIONS] vat.fit(etb[PREDICTORS], etb[IMPUTATIONS]) - vat.save(STORAGE_FOLDER / "vat.pkl") + vat.save(get_vat_model_path(year)) return vat @@ -130,8 +156,11 @@ def create_vat_model(overwrite_existing: bool = False): """ from policyengine_uk_data.utils.qrf import QRF - if (STORAGE_FOLDER / "vat.pkl").exists() and not overwrite_existing: - return QRF(file_path=STORAGE_FOLDER / "vat.pkl") + model_path = get_vat_model_path() + if model_path.exists() and not overwrite_existing: + cached = QRF(file_path=model_path) + if _vat_model_matches_current_release(cached): + return cached return save_imputation_models() diff --git a/policyengine_uk_data/datasets/imputations/wealth.py b/policyengine_uk_data/datasets/imputations/wealth.py index 0b67cb92f..36c5fd4dc 100644 --- a/policyengine_uk_data/datasets/imputations/wealth.py +++ b/policyengine_uk_data/datasets/imputations/wealth.py @@ -8,12 +8,14 @@ import numpy as np import pandas as pd +from policyengine_uk_data.datasets.private_releases import CURRENT_WAS_RELEASE from policyengine_uk_data.storage import STORAGE_FOLDER from policyengine_uk.data import UKSingleYearDataset from policyengine_uk import Microsimulation from policyengine_uk_data.utils.qrf import QRF -WAS_TAB_FOLDER = STORAGE_FOLDER / "was_2006_20" +WAS_TAB_FOLDER = STORAGE_FOLDER / CURRENT_WAS_RELEASE.name +WEALTH_MODEL_FILENAME = f"wealth_{CURRENT_WAS_RELEASE.name}.pkl" REGIONS = { 1: "NORTH_EAST", @@ -58,45 +60,45 @@ ] WAS_RENAMES = { - "R7xshhwgt": "household_weight", + "R8xshhwgt": "household_weight", # Components for estimating land holdings. - "DVLUKValR7_sum": "owned_land", # In the UK. - "DVPropertyR7": "property_wealth", - "DVFESHARESR7_aggr": "emp_shares_options", - "DVFShUKVR7_aggr": "uk_shares", - "DVIISAVR7_aggr": "investment_isas", - "DVFCollVR7_aggr": "unit_investment_trusts", - "TotpenR7_aggr": "pensions", - "DvvalDBTR7_aggr": "db_pensions", + "DVLUKValR8_sum": "owned_land", # In the UK. + "DVPropertyR8": "property_wealth", + "DVFESHARESR8_aggr": "emp_shares_options", + "DVFShUKVR8_aggr": "uk_shares", + "DVIISAVR8_aggr": "investment_isas", + "DVFCollVR8_aggr": "unit_investment_trusts", + "totalpenr8_aggr": "pensions", + "dvvaldbt_scaper8_aggr": "db_pensions", # Predictors for fusing to FRS. - "dvtotgirR7": "gross_income", - "NumAdultW7": "num_adults", - "NumCh18W7": "num_children", + "dvtotgirR8": "gross_income", + "NumAdultR8": "num_adults", + "NumCh18R8": "num_children", # Household Gross Annual income from occupational or private pensions - "DVGIPPENR7_AGGR": "private_pension_income", - "DVGISER7_AGGR": "self_employment_income", + "DVGIPPENR8_AGGR": "private_pension_income", + "DVGISER8_AGGR": "self_employment_income", # Household Gross annual income from investments - "DVGIINVR7_aggr": "capital_income", + "DVGIINVR8_aggr": "capital_income", # Household Total Annual Gross employee income - "DVGIEMPR7_AGGR": "employment_income", - "HBedrmW7": "num_bedrooms", - "GORR7": "region", - "DVPriRntW7": "is_renter", # {1, 2} TODO: Get codebook values. - "CTAmtW7": "council_tax", + "DVGIEMPR8_AGGR": "employment_income", + "HBedRmR8": "num_bedrooms", + "GORR8": "region", + "DVPriRntR8": "is_renter", # {1, 2} TODO: Get codebook values. + "CTAmtR8": "council_tax", # Other columns for reference. - "DVLOSValR7_sum": "non_uk_land", - "HFINWNTR7_Sum": "net_financial_wealth", - "DVLUKDebtR7_sum": "uk_land_debt", - "HFINWR7_Sum": "gross_financial_wealth", - "TotWlthR7": "wealth", - "DVhvalueR7": "main_residence_value", - "DVHseValR7_sum": "other_residential_property_value", - "DVBlDValR7_sum": "non_residential_property_value", - "DVTotinc_bhcR7": "household_net_income", - "DVSaValR7_aggr": "savings", - "vcarnr7": "num_vehicles", - "Tot_LosR7_aggr": "total_loans", - "Tot_los_exc_SLCR7_aggr": "total_loans_exc_slc", + "DVLOSValR8_sum": "non_uk_land", + "HFINWNTR8_Sum": "net_financial_wealth", + "DVLUKDebtR8_sum": "uk_land_debt", + "HFINWR8_SUM": "gross_financial_wealth", + "TotalWlthR8": "wealth", + "DVhvalueR8": "main_residence_value", + "DVHseValR8_sum": "other_residential_property_value", + "DVBlDValR8_sum": "non_residential_property_value", + "DVTotinc_bhcR8": "household_net_income", + "DVSaValR8_aggr": "savings", + "vcarnr8": "num_vehicles", + "Tot_LosR8_aggr": "total_loans", + "Tot_los_exc_SLCR8_aggr": "total_loans_exc_slc", } @@ -155,8 +157,27 @@ def generate_was_table(was: pd.DataFrame): return was -def _wealth_model_outputs_are_current(model: QRF) -> bool: - """Check whether a cached wealth model includes all current output columns.""" +WEALTH_MODEL_METADATA = { + "was_release_name": CURRENT_WAS_RELEASE.name, + "was_household_tab_filename": CURRENT_WAS_RELEASE.household_tab_filename, + "predictor_variables": tuple(PREDICTOR_VARIABLES), + "impute_variables": tuple(IMPUTE_VARIABLES), +} + + +def get_wealth_model_metadata() -> dict: + return dict(WEALTH_MODEL_METADATA) + + +def get_wealth_model_path(): + return STORAGE_FOLDER / WEALTH_MODEL_FILENAME + + +def _wealth_model_matches_current_release(model: QRF) -> bool: + """Check whether a cached wealth model was trained with current inputs.""" + if getattr(model, "metadata", {}) != get_wealth_model_metadata(): + return False + trained_outputs = getattr(model.model, "imputed_variables", None) return list(trained_outputs) == IMPUTE_VARIABLES @@ -256,19 +277,20 @@ def save_imputation_models(): Trained QRF model. """ was = pd.read_csv( - WAS_TAB_FOLDER / "was_round_7_hhold_eul_march_2022.tab", + WAS_TAB_FOLDER / CURRENT_WAS_RELEASE.household_tab_filename, sep="\t", low_memory=False, ) was = generate_was_table(was) wealth = QRF() + wealth.metadata = get_wealth_model_metadata() wealth.fit( was[PREDICTOR_VARIABLES], was[IMPUTE_VARIABLES], ) - wealth.save(STORAGE_FOLDER / "wealth.pkl") + wealth.save(get_wealth_model_path()) return wealth @@ -282,9 +304,10 @@ def create_wealth_model(overwrite_existing: bool = False): Returns: QRF model for wealth imputation. """ - if (STORAGE_FOLDER / "wealth.pkl").exists() and not overwrite_existing: - wealth = QRF(file_path=STORAGE_FOLDER / "wealth.pkl") - if _wealth_model_outputs_are_current(wealth): + model_path = get_wealth_model_path() + if model_path.exists() and not overwrite_existing: + wealth = QRF(file_path=model_path) + if _wealth_model_matches_current_release(wealth): return wealth return save_imputation_models() diff --git a/policyengine_uk_data/datasets/private_releases.py b/policyengine_uk_data/datasets/private_releases.py new file mode 100644 index 000000000..63e9a9c5d --- /dev/null +++ b/policyengine_uk_data/datasets/private_releases.py @@ -0,0 +1,79 @@ +from dataclasses import dataclass + + +@dataclass(frozen=True) +class LCFSRelease: + name: str + survey_year: int + fuel_price_year: int + ukds_study_number: int + doi: str + household_tab_filename: str + person_tab_filename: str + + @property + def raw_zip_name(self) -> str: + return f"{self.name}.zip" + + +@dataclass(frozen=True) +class WASRelease: + name: str + latest_round: int + end_year: int + ukds_study_number: int + doi: str + household_tab_filename: str + person_tab_filename: str + + @property + def raw_zip_name(self) -> str: + return f"{self.name}.zip" + + +@dataclass(frozen=True) +class ETBRelease: + name: str + latest_year: int + default_training_year: int + ukds_study_number: int + doi: str + household_tab_filename: str + person_tab_filename: str + + @property + def raw_zip_name(self) -> str: + return f"{self.name}.zip" + + +CURRENT_LCFS_RELEASE = LCFSRelease( + name="lcfs_2023_24", + survey_year=2023, + fuel_price_year=2023, + ukds_study_number=9468, + doi="10.5255/UKDA-SN-9468-3", + household_tab_filename="9468_dvhh_ukanon_v2_2023.tab", + person_tab_filename="9468_dvper_ukanon_202324_2023.tab", +) + + +CURRENT_WAS_RELEASE = WASRelease( + name="was_2006_22", + latest_round=8, + end_year=2022, + ukds_study_number=7215, + doi="10.5255/UKDA-SN-7215-20", + household_tab_filename="7215_was_round_8_hhold_eul_may_2025_230525.tab", + person_tab_filename="7215_was_round_8_person_eul_may_2025_230525.tab", +) + + +CURRENT_ETB_RELEASE = ETBRelease( + name="etb_1977_24", + latest_year=2024, + default_training_year=2023, + ukds_study_number=8856, + doi="10.5255/UKDA-SN-8856-4", + household_tab_filename="8856_householdv2_1977-2024.tab", + person_tab_filename="8856_personv2_2018-2024.tab", +) diff --git a/policyengine_uk_data/storage/download_private_prerequisites.py b/policyengine_uk_data/storage/download_private_prerequisites.py index 815759a76..446ee345e 100644 --- a/policyengine_uk_data/storage/download_private_prerequisites.py +++ b/policyengine_uk_data/storage/download_private_prerequisites.py @@ -1,4 +1,9 @@ from policyengine_uk_data.datasets.frs_release import CURRENT_FRS_RELEASE +from policyengine_uk_data.datasets.private_releases import ( + CURRENT_ETB_RELEASE, + CURRENT_LCFS_RELEASE, + CURRENT_WAS_RELEASE, +) from policyengine_uk_data.datasets.spi import SPI_RELEASE_NAME from policyengine_uk_data.utils.hf_destinations import PRIVATE_REPO from policyengine_uk_data.utils.huggingface import download @@ -11,9 +16,9 @@ PRIVATE_PREREQUISITES = [ (CURRENT_FRS_RELEASE.raw_zip_name, CURRENT_FRS_RELEASE.ukds_tab_subdir), - ("lcfs_2021_22.zip", None), - ("was_2006_20.zip", None), - ("etb_1977_21.zip", None), + (CURRENT_LCFS_RELEASE.raw_zip_name, None), + (CURRENT_WAS_RELEASE.raw_zip_name, None), + (CURRENT_ETB_RELEASE.raw_zip_name, None), (f"{SPI_RELEASE_NAME}.zip", None), ] diff --git a/policyengine_uk_data/tests/test_frs_prerequisites.py b/policyengine_uk_data/tests/test_frs_prerequisites.py index 5c22c50b9..22c5fb2d1 100644 --- a/policyengine_uk_data/tests/test_frs_prerequisites.py +++ b/policyengine_uk_data/tests/test_frs_prerequisites.py @@ -10,6 +10,11 @@ _needs_calibration_year_materialization, ) from policyengine_uk_data.datasets.frs_release import CURRENT_FRS_RELEASE +from policyengine_uk_data.datasets.private_releases import ( + CURRENT_ETB_RELEASE, + CURRENT_LCFS_RELEASE, + CURRENT_WAS_RELEASE, +) from policyengine_uk_data.datasets.spi import SPI_RELEASE_NAME from policyengine_uk_data.storage.download_private_prerequisites import ( PRIVATE_PREREQUISITES, @@ -31,6 +36,27 @@ def test_private_prerequisites_use_current_spi_release(): assert "spi_2020_21.zip" not in prerequisite_names +def test_private_prerequisites_use_current_lcfs_release(): + prerequisite_names = [filename for filename, _ in PRIVATE_PREREQUISITES] + + assert CURRENT_LCFS_RELEASE.raw_zip_name in prerequisite_names + assert "lcfs_2021_22.zip" not in prerequisite_names + + +def test_private_prerequisites_use_current_was_release(): + prerequisite_names = [filename for filename, _ in PRIVATE_PREREQUISITES] + + assert CURRENT_WAS_RELEASE.raw_zip_name in prerequisite_names + assert "was_2006_20.zip" not in prerequisite_names + + +def test_private_prerequisites_use_current_etb_release(): + prerequisite_names = [filename for filename, _ in PRIVATE_PREREQUISITES] + + assert CURRENT_ETB_RELEASE.raw_zip_name in prerequisite_names + assert "etb_1977_21.zip" not in prerequisite_names + + def test_current_frs_release_uses_survey_year_as_base_year(): assert CURRENT_FRS_RELEASE.base_year == CURRENT_FRS_RELEASE.survey_year diff --git a/policyengine_uk_data/tests/test_private_releases.py b/policyengine_uk_data/tests/test_private_releases.py new file mode 100644 index 000000000..a54e38672 --- /dev/null +++ b/policyengine_uk_data/tests/test_private_releases.py @@ -0,0 +1,72 @@ +from policyengine_uk_data.datasets.private_releases import ( + CURRENT_ETB_RELEASE, + CURRENT_LCFS_RELEASE, + CURRENT_WAS_RELEASE, +) + + +def test_current_lcfs_release_points_to_2023_24_ukds_files(): + assert CURRENT_LCFS_RELEASE.name == "lcfs_2023_24" + assert CURRENT_LCFS_RELEASE.ukds_study_number == 9468 + assert CURRENT_LCFS_RELEASE.doi == "10.5255/UKDA-SN-9468-3" + assert CURRENT_LCFS_RELEASE.household_tab_filename == "9468_dvhh_ukanon_v2_2023.tab" + assert ( + CURRENT_LCFS_RELEASE.person_tab_filename == "9468_dvper_ukanon_202324_2023.tab" + ) + assert CURRENT_LCFS_RELEASE.fuel_price_year == 2023 + + +def test_current_was_release_points_to_round_8_ukds_files(): + assert CURRENT_WAS_RELEASE.name == "was_2006_22" + assert CURRENT_WAS_RELEASE.latest_round == 8 + assert CURRENT_WAS_RELEASE.ukds_study_number == 7215 + assert CURRENT_WAS_RELEASE.doi == "10.5255/UKDA-SN-7215-20" + assert ( + CURRENT_WAS_RELEASE.household_tab_filename + == "7215_was_round_8_hhold_eul_may_2025_230525.tab" + ) + + +def test_current_etb_release_points_to_2023_24_ukds_files(): + assert CURRENT_ETB_RELEASE.name == "etb_1977_24" + assert CURRENT_ETB_RELEASE.latest_year == 2024 + assert CURRENT_ETB_RELEASE.default_training_year == 2023 + assert CURRENT_ETB_RELEASE.ukds_study_number == 8856 + assert CURRENT_ETB_RELEASE.doi == "10.5255/UKDA-SN-8856-4" + assert ( + CURRENT_ETB_RELEASE.household_tab_filename == "8856_householdv2_1977-2024.tab" + ) + + +def test_consumption_model_metadata_tracks_private_releases(): + from policyengine_uk_data.datasets.imputations.consumption import ( + CONSUMPTION_MODEL_FILENAME, + get_consumption_model_metadata, + get_has_fuel_model_metadata, + ) + + metadata = get_consumption_model_metadata() + has_fuel_metadata = get_has_fuel_model_metadata() + + assert CURRENT_LCFS_RELEASE.name in CONSUMPTION_MODEL_FILENAME + assert CURRENT_WAS_RELEASE.name in CONSUMPTION_MODEL_FILENAME + assert metadata["lcfs_release_name"] == CURRENT_LCFS_RELEASE.name + assert metadata["was_release_name"] == CURRENT_WAS_RELEASE.name + assert has_fuel_metadata["was_release_name"] == CURRENT_WAS_RELEASE.name + + +def test_etb_model_metadata_tracks_private_release(): + from policyengine_uk_data.datasets.imputations.services.etb import ( + get_public_services_model_metadata, + ) + from policyengine_uk_data.datasets.imputations.vat import ( + DEFAULT_ETB_YEAR, + get_vat_model_metadata, + ) + + vat_metadata = get_vat_model_metadata() + services_metadata = get_public_services_model_metadata() + + assert DEFAULT_ETB_YEAR == CURRENT_ETB_RELEASE.default_training_year + assert vat_metadata["etb_release_name"] == CURRENT_ETB_RELEASE.name + assert services_metadata["etb_release_name"] == CURRENT_ETB_RELEASE.name diff --git a/policyengine_uk_data/tests/test_road_fuel_volume_uprating.py b/policyengine_uk_data/tests/test_road_fuel_volume_uprating.py index 07ddbec23..e44a1ade9 100644 --- a/policyengine_uk_data/tests/test_road_fuel_volume_uprating.py +++ b/policyengine_uk_data/tests/test_road_fuel_volume_uprating.py @@ -13,6 +13,10 @@ fuel_spending_litre_proxy_uprating, uprate_lcfs_table, ) +from policyengine_uk_data.datasets.private_releases import ( + CURRENT_LCFS_RELEASE, + CURRENT_WAS_RELEASE, +) from policyengine_uk_data.sources.road_fuel_volume import ( FISCAL_YEAR_AVERAGE_DUTY_RATE, HMRC_ROAD_FUEL_CLEARANCES_MLITRES, @@ -167,23 +171,24 @@ def test__given_lcfs_training_table__then_fuel_uprating_preserves_litre_proxy(): # When out = uprate_lcfs_table(household.copy(), "2024") + start_year = CURRENT_LCFS_RELEASE.fuel_price_year petrol_expected = fuel_spending_litre_proxy_uprating( variable="petrol_spending", - start_year=2021, + start_year=start_year, end_year=2024, ) diesel_expected = fuel_spending_litre_proxy_uprating( variable="diesel_spending", - start_year=2021, + start_year=start_year, end_year=2024, ) - volume_only = road_fuel_volume_uprating(start_year=2021, end_year=2024) + volume_only = road_fuel_volume_uprating(start_year=start_year, end_year=2024) # Then assert out["petrol_spending"].iloc[0] == petrol_expected assert out["diesel_spending"].iloc[0] == diesel_expected - assert petrol_expected > volume_only - assert diesel_expected > volume_only + assert petrol_expected != volume_only + assert diesel_expected != volume_only assert petrol_expected != 1.3 @@ -191,6 +196,8 @@ def test__given_fuel_method_change__then_consumption_model_filename_is_versioned # Then assert CONSUMPTION_MODEL_FILENAME != "consumption.pkl" assert "fuel_litre_proxy" in CONSUMPTION_MODEL_FILENAME + assert CURRENT_LCFS_RELEASE.name in CONSUMPTION_MODEL_FILENAME + assert CURRENT_WAS_RELEASE.name in CONSUMPTION_MODEL_FILENAME def test__given_obr_2027_volume__then_rate_difference_matches_cost_benchmark(): diff --git a/policyengine_uk_data/tests/test_student_loan_balance.py b/policyengine_uk_data/tests/test_student_loan_balance.py index b18c3c8f8..ea95bdf61 100644 --- a/policyengine_uk_data/tests/test_student_loan_balance.py +++ b/policyengine_uk_data/tests/test_student_loan_balance.py @@ -17,13 +17,13 @@ def test_generate_was_table_derives_student_loan_balance(): row = {column: 0 for column in wealth.WAS_RENAMES} - row["R7xshhwgt"] = 1 - row["GORR7"] = 11 - row["DVPriRntW7"] = 1 - row["TotpenR7_aggr"] = 100 - row["DvvalDBTR7_aggr"] = 25 - row["Tot_LosR7_aggr"] = 20_000 - row["Tot_los_exc_SLCR7_aggr"] = 5_000 + row["R8xshhwgt"] = 1 + row["GORR8"] = 11 + row["DVPriRntR8"] = 1 + row["totalpenr8_aggr"] = 100 + row["dvvaldbt_scaper8_aggr"] = 25 + row["Tot_LosR8_aggr"] = 20_000 + row["Tot_los_exc_SLCR8_aggr"] = 5_000 was = wealth.generate_was_table(pd.DataFrame([row])) @@ -33,15 +33,17 @@ def test_generate_was_table_derives_student_loan_balance(): def test_create_wealth_model_reuses_current_cached_model(tmp_path, monkeypatch): - model_path = tmp_path / "wealth.pkl" + model_path = tmp_path / wealth.WEALTH_MODEL_FILENAME model_path.write_bytes(b"placeholder") cached_model = SimpleNamespace( - model=SimpleNamespace(imputed_variables=list(wealth.IMPUTE_VARIABLES)) + metadata=wealth.get_wealth_model_metadata(), + model=SimpleNamespace(imputed_variables=list(wealth.IMPUTE_VARIABLES)), ) class DummyQRF: def __init__(self, file_path=None): assert file_path == model_path + self.metadata = cached_model.metadata self.model = cached_model.model monkeypatch.setattr(wealth, "STORAGE_FOLDER", tmp_path) @@ -57,12 +59,13 @@ def __init__(self, file_path=None): def test_create_wealth_model_retrains_when_cached_outputs_stale(tmp_path, monkeypatch): - model_path = tmp_path / "wealth.pkl" + model_path = tmp_path / wealth.WEALTH_MODEL_FILENAME model_path.write_bytes(b"placeholder") class DummyQRF: def __init__(self, file_path=None): assert file_path == model_path + self.metadata = wealth.get_wealth_model_metadata() self.model = SimpleNamespace(imputed_variables=["owned_land"]) fresh_model = object() @@ -74,6 +77,30 @@ def __init__(self, file_path=None): assert wealth.create_wealth_model() is fresh_model +def test_create_wealth_model_retrains_when_cached_release_stale(tmp_path, monkeypatch): + model_path = tmp_path / wealth.WEALTH_MODEL_FILENAME + model_path.write_bytes(b"placeholder") + + class DummyQRF: + def __init__(self, file_path=None): + assert file_path == model_path + self.metadata = { + **wealth.get_wealth_model_metadata(), + "was_release_name": "was_2006_20", + } + self.model = SimpleNamespace( + imputed_variables=list(wealth.IMPUTE_VARIABLES) + ) + + fresh_model = object() + + monkeypatch.setattr(wealth, "STORAGE_FOLDER", tmp_path) + monkeypatch.setattr(wealth, "QRF", DummyQRF) + monkeypatch.setattr(wealth, "save_imputation_models", lambda: fresh_model) + + assert wealth.create_wealth_model() is fresh_model + + def test_allocate_student_loan_balance_prefers_repayers_then_tertiary(): person = pd.DataFrame( { diff --git a/policyengine_uk_data/tests/test_vat_parameters.py b/policyengine_uk_data/tests/test_vat_parameters.py index 2c6b0e8d3..d99c04f0a 100644 --- a/policyengine_uk_data/tests/test_vat_parameters.py +++ b/policyengine_uk_data/tests/test_vat_parameters.py @@ -40,6 +40,13 @@ def test_vat_rate_by_year_fallback_matches_2020_statute(): assert VAT_RATE_BY_YEAR[2020] == (0.2, 0.03) +def test_default_etb_year_tracks_current_release(): + from policyengine_uk_data.datasets.imputations.vat import DEFAULT_ETB_YEAR + from policyengine_uk_data.datasets.private_releases import CURRENT_ETB_RELEASE + + assert DEFAULT_ETB_YEAR == CURRENT_ETB_RELEASE.default_training_year == 2023 + + def test_generate_etb_table_uses_year_param(): """Changing the `year` arg filters ETB rows by that year. From d5d95b72dd04f82da94ea421cd062255afbb1988 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Sun, 24 May 2026 07:49:30 -0400 Subject: [PATCH 2/4] Use current UKDS tab zip layouts --- .../datasets/private_releases.py | 18 +++++++++----- .../storage/download_private_prerequisites.py | 6 ++--- .../tests/test_frs_prerequisites.py | 24 ++++++++++++------- .../tests/test_private_releases.py | 20 +++++++++------- 4 files changed, 42 insertions(+), 26 deletions(-) diff --git a/policyengine_uk_data/datasets/private_releases.py b/policyengine_uk_data/datasets/private_releases.py index 63e9a9c5d..55bc8a08c 100644 --- a/policyengine_uk_data/datasets/private_releases.py +++ b/policyengine_uk_data/datasets/private_releases.py @@ -8,6 +8,7 @@ class LCFSRelease: fuel_price_year: int ukds_study_number: int doi: str + ukds_tab_subdir: str household_tab_filename: str person_tab_filename: str @@ -23,6 +24,7 @@ class WASRelease: end_year: int ukds_study_number: int doi: str + ukds_tab_subdir: str household_tab_filename: str person_tab_filename: str @@ -38,6 +40,7 @@ class ETBRelease: default_training_year: int ukds_study_number: int doi: str + ukds_tab_subdir: str household_tab_filename: str person_tab_filename: str @@ -52,8 +55,9 @@ def raw_zip_name(self) -> str: fuel_price_year=2023, ukds_study_number=9468, doi="10.5255/UKDA-SN-9468-3", - household_tab_filename="9468_dvhh_ukanon_v2_2023.tab", - person_tab_filename="9468_dvper_ukanon_202324_2023.tab", + ukds_tab_subdir="UKDA-9468-tab/tab", + household_tab_filename="dvhh_ukanon_v2_2023.tab", + person_tab_filename="dvper_ukanon_202324_2023.tab", ) @@ -63,8 +67,9 @@ def raw_zip_name(self) -> str: end_year=2022, ukds_study_number=7215, doi="10.5255/UKDA-SN-7215-20", - household_tab_filename="7215_was_round_8_hhold_eul_may_2025_230525.tab", - person_tab_filename="7215_was_round_8_person_eul_may_2025_230525.tab", + ukds_tab_subdir="UKDA-7215-tab/tab", + household_tab_filename="was_round_8_hhold_eul_may_2025_230525.tab", + person_tab_filename="was_round_8_person_eul_may_2025_230525.tab", ) @@ -74,6 +79,7 @@ def raw_zip_name(self) -> str: default_training_year=2023, ukds_study_number=8856, doi="10.5255/UKDA-SN-8856-4", - household_tab_filename="8856_householdv2_1977-2024.tab", - person_tab_filename="8856_personv2_2018-2024.tab", + ukds_tab_subdir="UKDA-8856-tab/tab", + household_tab_filename="householdv2_1977-2024.tab", + person_tab_filename="personv2_2018-2024.tab", ) diff --git a/policyengine_uk_data/storage/download_private_prerequisites.py b/policyengine_uk_data/storage/download_private_prerequisites.py index 446ee345e..35fe24f4e 100644 --- a/policyengine_uk_data/storage/download_private_prerequisites.py +++ b/policyengine_uk_data/storage/download_private_prerequisites.py @@ -16,9 +16,9 @@ PRIVATE_PREREQUISITES = [ (CURRENT_FRS_RELEASE.raw_zip_name, CURRENT_FRS_RELEASE.ukds_tab_subdir), - (CURRENT_LCFS_RELEASE.raw_zip_name, None), - (CURRENT_WAS_RELEASE.raw_zip_name, None), - (CURRENT_ETB_RELEASE.raw_zip_name, None), + (CURRENT_LCFS_RELEASE.raw_zip_name, CURRENT_LCFS_RELEASE.ukds_tab_subdir), + (CURRENT_WAS_RELEASE.raw_zip_name, CURRENT_WAS_RELEASE.ukds_tab_subdir), + (CURRENT_ETB_RELEASE.raw_zip_name, CURRENT_ETB_RELEASE.ukds_tab_subdir), (f"{SPI_RELEASE_NAME}.zip", None), ] diff --git a/policyengine_uk_data/tests/test_frs_prerequisites.py b/policyengine_uk_data/tests/test_frs_prerequisites.py index 22c5fb2d1..6a1d12911 100644 --- a/policyengine_uk_data/tests/test_frs_prerequisites.py +++ b/policyengine_uk_data/tests/test_frs_prerequisites.py @@ -37,24 +37,30 @@ def test_private_prerequisites_use_current_spi_release(): def test_private_prerequisites_use_current_lcfs_release(): - prerequisite_names = [filename for filename, _ in PRIVATE_PREREQUISITES] + prerequisites = dict(PRIVATE_PREREQUISITES) - assert CURRENT_LCFS_RELEASE.raw_zip_name in prerequisite_names - assert "lcfs_2021_22.zip" not in prerequisite_names + assert prerequisites[CURRENT_LCFS_RELEASE.raw_zip_name] == ( + CURRENT_LCFS_RELEASE.ukds_tab_subdir + ) + assert "lcfs_2021_22.zip" not in prerequisites def test_private_prerequisites_use_current_was_release(): - prerequisite_names = [filename for filename, _ in PRIVATE_PREREQUISITES] + prerequisites = dict(PRIVATE_PREREQUISITES) - assert CURRENT_WAS_RELEASE.raw_zip_name in prerequisite_names - assert "was_2006_20.zip" not in prerequisite_names + assert prerequisites[CURRENT_WAS_RELEASE.raw_zip_name] == ( + CURRENT_WAS_RELEASE.ukds_tab_subdir + ) + assert "was_2006_20.zip" not in prerequisites def test_private_prerequisites_use_current_etb_release(): - prerequisite_names = [filename for filename, _ in PRIVATE_PREREQUISITES] + prerequisites = dict(PRIVATE_PREREQUISITES) - assert CURRENT_ETB_RELEASE.raw_zip_name in prerequisite_names - assert "etb_1977_21.zip" not in prerequisite_names + assert prerequisites[CURRENT_ETB_RELEASE.raw_zip_name] == ( + CURRENT_ETB_RELEASE.ukds_tab_subdir + ) + assert "etb_1977_21.zip" not in prerequisites def test_current_frs_release_uses_survey_year_as_base_year(): diff --git a/policyengine_uk_data/tests/test_private_releases.py b/policyengine_uk_data/tests/test_private_releases.py index a54e38672..dc84bd01c 100644 --- a/policyengine_uk_data/tests/test_private_releases.py +++ b/policyengine_uk_data/tests/test_private_releases.py @@ -9,10 +9,9 @@ def test_current_lcfs_release_points_to_2023_24_ukds_files(): assert CURRENT_LCFS_RELEASE.name == "lcfs_2023_24" assert CURRENT_LCFS_RELEASE.ukds_study_number == 9468 assert CURRENT_LCFS_RELEASE.doi == "10.5255/UKDA-SN-9468-3" - assert CURRENT_LCFS_RELEASE.household_tab_filename == "9468_dvhh_ukanon_v2_2023.tab" - assert ( - CURRENT_LCFS_RELEASE.person_tab_filename == "9468_dvper_ukanon_202324_2023.tab" - ) + assert CURRENT_LCFS_RELEASE.ukds_tab_subdir == "UKDA-9468-tab/tab" + assert CURRENT_LCFS_RELEASE.household_tab_filename == "dvhh_ukanon_v2_2023.tab" + assert CURRENT_LCFS_RELEASE.person_tab_filename == "dvper_ukanon_202324_2023.tab" assert CURRENT_LCFS_RELEASE.fuel_price_year == 2023 @@ -21,9 +20,14 @@ def test_current_was_release_points_to_round_8_ukds_files(): assert CURRENT_WAS_RELEASE.latest_round == 8 assert CURRENT_WAS_RELEASE.ukds_study_number == 7215 assert CURRENT_WAS_RELEASE.doi == "10.5255/UKDA-SN-7215-20" + assert CURRENT_WAS_RELEASE.ukds_tab_subdir == "UKDA-7215-tab/tab" assert ( CURRENT_WAS_RELEASE.household_tab_filename - == "7215_was_round_8_hhold_eul_may_2025_230525.tab" + == "was_round_8_hhold_eul_may_2025_230525.tab" + ) + assert ( + CURRENT_WAS_RELEASE.person_tab_filename + == "was_round_8_person_eul_may_2025_230525.tab" ) @@ -33,9 +37,9 @@ def test_current_etb_release_points_to_2023_24_ukds_files(): assert CURRENT_ETB_RELEASE.default_training_year == 2023 assert CURRENT_ETB_RELEASE.ukds_study_number == 8856 assert CURRENT_ETB_RELEASE.doi == "10.5255/UKDA-SN-8856-4" - assert ( - CURRENT_ETB_RELEASE.household_tab_filename == "8856_householdv2_1977-2024.tab" - ) + assert CURRENT_ETB_RELEASE.ukds_tab_subdir == "UKDA-8856-tab/tab" + assert CURRENT_ETB_RELEASE.household_tab_filename == "householdv2_1977-2024.tab" + assert CURRENT_ETB_RELEASE.person_tab_filename == "personv2_2018-2024.tab" def test_consumption_model_metadata_tracks_private_releases(): From 89fa2e781c3209d0876c36cf81f4c7d9bf220454 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Sun, 24 May 2026 09:02:37 -0400 Subject: [PATCH 3/4] Handle refreshed LCFS and ETB aggregate shifts --- .../datasets/create_datasets.py | 12 +++ .../datasets/imputations/consumption.py | 79 ++++++++++++------- .../datasets/imputations/services/services.py | 48 +++++++++-- .../tests/microsimulation/reforms_config.yaml | 14 +--- .../tests/test_lcfs_consumption_ingestion.py | 62 +++++++++++++++ .../tests/test_private_releases.py | 4 + 6 files changed, 172 insertions(+), 47 deletions(-) create mode 100644 policyengine_uk_data/tests/test_lcfs_consumption_ingestion.py diff --git a/policyengine_uk_data/datasets/create_datasets.py b/policyengine_uk_data/datasets/create_datasets.py index c76ba7d6a..391896dbd 100644 --- a/policyengine_uk_data/datasets/create_datasets.py +++ b/policyengine_uk_data/datasets/create_datasets.py @@ -103,6 +103,7 @@ def main(): "Clone and assign OA geography", "Calibrate constituency weights", "Calibrate local authority weights", + "Calibrate public service aggregates", "Calibrate fuel litres", "Save final dataset", "Create tiny datasets", @@ -288,6 +289,17 @@ def main(): ) update_dataset(materialize_step, "completed") + update_dataset("Calibrate public service aggregates", "processing") + from policyengine_uk_data.datasets.imputations.services.services import ( + calibrate_rail_subsidy_spending, + ) + + calibrate_rail_subsidy_spending( + frs_calibrated, + frs_release.calibration_year, + ) + update_dataset("Calibrate public service aggregates", "completed") + update_dataset("Calibrate fuel litres", "processing") from policyengine_uk_data.datasets.imputations.consumption import ( calibrate_dataset_fuel_litre_proxies_to_road_fuel, diff --git a/policyengine_uk_data/datasets/imputations/consumption.py b/policyengine_uk_data/datasets/imputations/consumption.py index cb04ffb0c..bf8ad42fd 100644 --- a/policyengine_uk_data/datasets/imputations/consumption.py +++ b/policyengine_uk_data/datasets/imputations/consumption.py @@ -114,36 +114,36 @@ } HOUSEHOLD_LCF_RENAMES = { - "G018": "is_adult", - "G019": "is_child", - "Gorx": "region", - "P389p": "hbai_household_net_income", + "g018": "is_adult", + "g019": "is_child", + "gorx": "region", + "p389p": "hbai_household_net_income", "p344p": "household_gross_income", "weighta": "household_weight", } PERSON_LCF_RENAMES = { - "B303p": "employment_income", - "B3262p": "self_employment_income", - "B3381": "state_pension", - "P049p": "private_pension_income", + "b303p": "employment_income", + "b3262p": "self_employment_income", + "b3381": "state_pension", + "p049p": "private_pension_income", } CONSUMPTION_VARIABLE_RENAMES = { - "P601": "food_and_non_alcoholic_beverages_consumption", - "P602": "alcohol_and_tobacco_consumption", - "P603": "clothing_and_footwear_consumption", - "P604": "housing_water_and_electricity_consumption", - "P605": "household_furnishings_consumption", - "P606": "health_consumption", - "P607": "transport_consumption", - "P608": "communication_consumption", - "P609": "recreation_consumption", - "P610": "education_consumption", - "P611": "restaurants_and_hotels_consumption", - "P612": "miscellaneous_consumption", - "C72211": "petrol_spending", - "C72212": "diesel_spending", - "P537": "domestic_energy_consumption", # aggregate kept for backward compat + "p601": "food_and_non_alcoholic_beverages_consumption", + "p602": "alcohol_and_tobacco_consumption", + "p603": "clothing_and_footwear_consumption", + "p604": "housing_water_and_electricity_consumption", + "p605": "household_furnishings_consumption", + "p606": "health_consumption", + "p607": "transport_consumption", + "p608": "communication_consumption", + "p609": "recreation_consumption", + "p610": "education_consumption", + "p611": "restaurants_and_hotels_consumption", + "p612": "miscellaneous_consumption", + "c72211": "petrol_spending", + "c72212": "diesel_spending", + "p537": "domestic_energy_consumption", # aggregate kept for backward compat } PREDICTOR_VARIABLES = [ @@ -220,6 +220,7 @@ def get_consumption_model_metadata() -> dict: "frs_base_year": CURRENT_FRS_RELEASE.base_year, "predictor_variables": tuple(PREDICTOR_VARIABLES), "impute_variables": tuple(IMPUTATIONS), + "domestic_energy_consumption_source": "calibrated_electricity_plus_gas", } @@ -396,10 +397,10 @@ def _derive_energy_from_lcfs(household: pd.DataFrame) -> pd.DataFrame: All values are annualised (multiply weekly × 52) downstream with other variables. """ - p537 = household["P537"] - b226 = household["B226"] - b489 = household["B489"] - b490 = household["B490"] + p537 = household["p537"] + b226 = household["b226"] + b489 = household["b489"] + b490 = household["b490"] # Mean electricity share from DD-billed households (B226/P537 median ≈ 0.55) dd_mask = (b226 > 0) & (p537 > 0) @@ -443,6 +444,12 @@ def _derive_energy_from_lcfs(household: pd.DataFrame) -> pd.DataFrame: return household +def _normalise_lcfs_columns(data: pd.DataFrame) -> pd.DataFrame: + data = data.copy() + data.columns = [column.lower() for column in data.columns] + return data + + def _calibrate_energy_to_need( household: pd.DataFrame, income_col: str = "household_gross_income" ) -> pd.DataFrame: @@ -562,13 +569,16 @@ def generate_lcfs_table(lcfs_person: pd.DataFrame, lcfs_household: pd.DataFrame) calibrates to NEED 2023 income-band targets, and includes housing predictors (tenure_type, accommodation_type) alongside the existing income/demographic ones. """ + lcfs_person = _normalise_lcfs_columns(lcfs_person) + lcfs_household = _normalise_lcfs_columns(lcfs_household) + person = lcfs_person.rename(columns=PERSON_LCF_RENAMES) household = lcfs_household.rename(columns=HOUSEHOLD_LCF_RENAMES) household["region"] = household["region"].map(REGIONS) # Housing predictors — map LCFS codes to FRS enum strings - household["tenure_type"] = lcfs_household["A122"].map(LCFS_TENURE_MAP) - household["accommodation_type"] = lcfs_household["A121"].map(LCFS_ACCOMM_MAP) + household["tenure_type"] = lcfs_household["a122"].map(LCFS_TENURE_MAP) + household["accommodation_type"] = lcfs_household["a121"].map(LCFS_ACCOMM_MAP) # Derive gas and electricity before renaming/annualising P537 household = _derive_energy_from_lcfs(household) @@ -588,13 +598,17 @@ def generate_lcfs_table(lcfs_person: pd.DataFrame, lcfs_household: pd.DataFrame) for variable in annualise: household[variable] = household[variable] * WEEKS_IN_YEAR for variable in PERSON_LCF_RENAMES.values(): + totals_by_case = person.groupby("case")[variable].sum() household[variable] = ( - person[variable].groupby(person.case).sum()[household.case] * WEEKS_IN_YEAR + household["case"].map(totals_by_case).fillna(0) * WEEKS_IN_YEAR ) household.household_weight *= 1_000 # Calibrate energy to NEED 2023 targets by income band household = _calibrate_energy_to_need(household) + household["domestic_energy_consumption"] = ( + household["electricity_consumption"] + household["gas_consumption"] + ) # Impute has_fuel_consumption from WAS vehicle ownership household = impute_has_fuel_to_lcfs(household) @@ -870,6 +884,11 @@ def _wmean(arr, mask): arr[mask] *= target / wm dataset.household[col] = arr + dataset.household["domestic_energy_consumption"] = ( + dataset.household["electricity_consumption"] + + dataset.household["gas_consumption"] + ) + # Zero out car-fuel spending for non-ICE households no_fuel = has_fuel_consumption == 0 dataset.household["petrol_spending"][no_fuel] = 0 diff --git a/policyengine_uk_data/datasets/imputations/services/services.py b/policyengine_uk_data/datasets/imputations/services/services.py index 2b7892f2a..43ab26391 100644 --- a/policyengine_uk_data/datasets/imputations/services/services.py +++ b/policyengine_uk_data/datasets/imputations/services/services.py @@ -6,16 +6,20 @@ """ from policyengine_uk.data import UKSingleYearDataset +from policyengine_uk import Microsimulation from policyengine_uk.system import system +from policyengine_uk_data.datasets.private_releases import CURRENT_ETB_RELEASE from .nhs import impute_nhs_usage from .etb import impute_public_services, create_efrs_input_dataset -# ETB survey year (most recent year in ETB data) -ETB_SURVEY_YEAR = 2021 +# ETB survey year used by the current training data. +ETB_SURVEY_YEAR = CURRENT_ETB_RELEASE.default_training_year -# Fallback fare index for 2021 if parameter not yet available in policyengine-uk -# This is the cumulative fare index from base year 2020 (+1.0% from 2020) -FALLBACK_FARE_INDEX_2021 = 1.010 +RAIL_SUBSIDY_TARGETS = { + # ORR/GOV.UK rail finance statistics report GBP 21.6bn of government + # support to the rail industry in 2024-25. + 2025: 21.6e9, +} def get_fare_index_survey_year() -> float: @@ -28,8 +32,38 @@ def get_fare_index_survey_year() -> float: try: return system.parameters.gov.dft.rail.fare_index(ETB_SURVEY_YEAR) except AttributeError: - # Parameter not yet available in policyengine-uk - return FALLBACK_FARE_INDEX_2021 + return 1.0 + + +def calibrate_rail_subsidy_spending( + dataset: UKSingleYearDataset, + time_period: int, +) -> float | None: + target = RAIL_SUBSIDY_TARGETS.get(time_period) + if target is None: + return None + + original_time_period = dataset.time_period + dataset.time_period = str(original_time_period) + try: + simulation = Microsimulation(dataset=dataset) + actual = simulation.calculate( + "rail_subsidy_spending", + period=time_period, + map_to="household", + ).sum() + finally: + dataset.time_period = original_time_period + if actual <= 0: + raise ValueError( + f"Cannot calibrate rail_subsidy_spending: aggregate is {actual}." + ) + + scale = target / actual + dataset.household["rail_usage"] *= scale + if "rail_subsidy_spending" in dataset.household: + dataset.household["rail_subsidy_spending"] *= scale + return scale def impute_services( diff --git a/policyengine_uk_data/tests/microsimulation/reforms_config.yaml b/policyengine_uk_data/tests/microsimulation/reforms_config.yaml index bc688d4ef..8102ec905 100644 --- a/policyengine_uk_data/tests/microsimulation/reforms_config.yaml +++ b/policyengine_uk_data/tests/microsimulation/reforms_config.yaml @@ -25,16 +25,10 @@ reforms: parameters: gov.hmrc.national_insurance.class_1.rates.employee.main: 0.1 - name: Raise VAT standard rate by 2pp - # Delta scales as `consumption * 0.5 * 0.02 / 0.38 ≈ 0.0263 * consumption` - # (full-rate share 0.5 × 2pp rate change ÷ 0.38 microdata-VAT-coverage - # parameter). The enhanced FRS now carries a UK-realistic ~£1.6T total - # consumption base (ONS 2025 total consumer expenditure ≈ £1.6T), so a - # 2pp standard-rate rise produces ~£43 bn. The prior 25.0 bn expectation - # predates the consumption-base growth. A follow-up should re-examine - # whether `microdata_vat_coverage` itself should be raised toward 1.0 - # now that the enhanced FRS consumption aggregate has caught up — see - # #364. - expected_impact: 43.0 + # The refreshed ETB 2023-24 training file lowers the imputed full-rate VAT + # expenditure-rate base relative to the older ETB input, so a 2pp standard-rate + # rise now produces about GBP 31bn in this generated dataset. + expected_impact: 31.3 tolerance: 10.0 parameters: gov.hmrc.vat.standard_rate: 0.22 diff --git a/policyengine_uk_data/tests/test_lcfs_consumption_ingestion.py b/policyengine_uk_data/tests/test_lcfs_consumption_ingestion.py new file mode 100644 index 000000000..3641061cf --- /dev/null +++ b/policyengine_uk_data/tests/test_lcfs_consumption_ingestion.py @@ -0,0 +1,62 @@ +import pandas as pd + +from policyengine_uk_data.datasets.imputations import consumption +from policyengine_uk_data.datasets.frs import WEEKS_IN_YEAR + + +def test_generate_lcfs_table_accepts_current_lowercase_tab_headers(monkeypatch): + def add_has_fuel(household): + household = household.copy() + household["has_fuel_consumption"] = 1.0 + return household + + monkeypatch.setattr(consumption, "impute_has_fuel_to_lcfs", add_has_fuel) + + household = pd.DataFrame( + { + "case": [1], + "g018": [2], + "g019": [1], + "gorx": [7], + "p389p": [1_000.0], + "p344p": [1_500.0], + "weighta": [0.5], + "a121": [2], + "a122": [5], + "b226": [10.0], + "b489": [0.0], + "b490": [0.0], + "p537": [20.0], + **{f"p{code}": [1.0] for code in range(601, 613)}, + "c72211": [5.0], + "c72212": [6.0], + } + ) + person = pd.DataFrame( + { + "case": [1, 1], + "b303p": [100.0, 200.0], + "b3262p": [10.0, 20.0], + "b3381": [0.0, 0.0], + "p049p": [5.0, 5.0], + } + ) + + result = consumption.generate_lcfs_table(person, household) + + assert len(result) == 1 + assert result["region"].iloc[0] == "LONDON" + assert result["tenure_type"].iloc[0] == "OWNED_WITH_MORTGAGE" + assert result["accommodation_type"].iloc[0] == "HOUSE_SEMI_DETACHED" + assert result["employment_income"].iloc[0] == 300.0 * WEEKS_IN_YEAR + assert result["household_weight"].iloc[0] == 500 + assert ( + result["domestic_energy_consumption"].iloc[0] + == result["electricity_consumption"].iloc[0] + result["gas_consumption"].iloc[0] + ) + assert ( + result[consumption.PREDICTOR_VARIABLES + consumption.IMPUTATIONS] + .notna() + .all() + .all() + ) diff --git a/policyengine_uk_data/tests/test_private_releases.py b/policyengine_uk_data/tests/test_private_releases.py index dc84bd01c..a3429b439 100644 --- a/policyengine_uk_data/tests/test_private_releases.py +++ b/policyengine_uk_data/tests/test_private_releases.py @@ -63,6 +63,9 @@ def test_etb_model_metadata_tracks_private_release(): from policyengine_uk_data.datasets.imputations.services.etb import ( get_public_services_model_metadata, ) + from policyengine_uk_data.datasets.imputations.services.services import ( + ETB_SURVEY_YEAR, + ) from policyengine_uk_data.datasets.imputations.vat import ( DEFAULT_ETB_YEAR, get_vat_model_metadata, @@ -72,5 +75,6 @@ def test_etb_model_metadata_tracks_private_release(): services_metadata = get_public_services_model_metadata() assert DEFAULT_ETB_YEAR == CURRENT_ETB_RELEASE.default_training_year + assert ETB_SURVEY_YEAR == CURRENT_ETB_RELEASE.default_training_year assert vat_metadata["etb_release_name"] == CURRENT_ETB_RELEASE.name assert services_metadata["etb_release_name"] == CURRENT_ETB_RELEASE.name From 5c3d0bcf285b2b58d8334743676d7713beb0fcd3 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Sun, 24 May 2026 09:49:12 -0400 Subject: [PATCH 4/4] Use locked uv environment in data CI --- .github/workflows/pull_request.yaml | 10 +++++----- .github/workflows/push.yaml | 12 ++++++------ 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/.github/workflows/pull_request.yaml b/.github/workflows/pull_request.yaml index 158864a2e..1c4eccc17 100644 --- a/.github/workflows/pull_request.yaml +++ b/.github/workflows/pull_request.yaml @@ -64,14 +64,14 @@ jobs: uses: actions/setup-python@v6 with: python-version: 3.13 - - name: Install package - run: uv pip install -e ".[dev]" --system + - name: Sync locked environment + run: uv sync --frozen --all-extras - name: Download data inputs - run: make download + run: uv run --frozen make download env: HUGGING_FACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }} - name: Build datasets - run: make data + run: uv run --frozen make data env: TESTING: "1" - name: Save calibration log (constituencies) @@ -87,4 +87,4 @@ jobs: path: la_calibration_log.csv - name: Run tests - run: make test + run: uv run --frozen make test diff --git a/.github/workflows/push.yaml b/.github/workflows/push.yaml index 973f5e91e..45143e274 100644 --- a/.github/workflows/push.yaml +++ b/.github/workflows/push.yaml @@ -52,14 +52,14 @@ jobs: with: workload_identity_provider: "projects/322898545428/locations/global/workloadIdentityPools/policyengine-research-id-pool/providers/prod-github-provider" service_account: "policyengine-research@policyengine-research.iam.gserviceaccount.com" - - name: Install package - run: uv pip install -e ".[dev]" --system + - name: Sync locked environment + run: uv sync --frozen --all-extras - name: Download data inputs - run: make download + run: uv run --frozen make download env: HUGGING_FACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }} - name: Build datasets - run: make data + run: uv run --frozen make data - name: Save calibration log (constituencies) uses: actions/upload-artifact@v7 with: @@ -72,9 +72,9 @@ jobs: name: la_calibration_log.csv path: la_calibration_log.csv - name: Run tests - run: make test + run: uv run --frozen make test - name: Upload data - run: make upload + run: uv run --frozen make upload env: HUGGING_FACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }} - name: Publish a git tag