diff --git a/statvar_imports/statistics_poland/README.md b/statvar_imports/statistics_poland/README.md index 17c62805a9..5df3b7fe82 100644 --- a/statvar_imports/statistics_poland/README.md +++ b/statvar_imports/statistics_poland/README.md @@ -38,10 +38,9 @@ python3 tools/statvar_importer/stat_var_processor.py \ **For Main data run** ```bash python3 tools/statvar_importer/stat_var_processor.py \ - --input_data='statvar_imports/statistics_poland/poland_input/StatisticsPoland_input.csv' \ + --input_data='statvar_imports/statistics_poland/StatisticsPoland_input.csv' \ --pv_map='statvar_imports/statistics_poland/StatisticsPoland_pvmap.csv' \ - --output_path='statvar_imports/statistics_poland/poland_output/StatisticsPoland_output' \ + --output_path='statvar_imports/statistics_poland/StatisticsPoland_output' \ --config_file='statvar_imports/statistics_poland/Statistics_Poland_metadata.csv' \ --existing_statvar_mcf=gs://unresolved_mcf/scripts/statvar/stat_vars.mcf ``` - diff --git a/statvar_imports/statistics_poland/download_input_data.py b/statvar_imports/statistics_poland/download_input_data.py index 691c7057b9..b12df03909 100644 --- a/statvar_imports/statistics_poland/download_input_data.py +++ b/statvar_imports/statistics_poland/download_input_data.py @@ -1,31 +1,39 @@ import pandas as pd import os import logging +import sys from datetime import datetime # Configure logging logging.basicConfig( - level=logging.INFO, + level=logging.INFO, format='%(levelname)s: %(message)s' ) -# Configuration -INPUT_FILE = "statvar_imports/statistics_poland/poland_data_sample/poland_raw.xlsx" -OUTPUT_DIR = "statvar_imports/statistics_poland/poland_input" -OUTPUT_FILE = os.path.join(OUTPUT_DIR, "StatisticsPoland_input.csv") +# --- FLATTENED PATH LOGIC --- +# Get the directory where THIS script is actually saved +BASE_PATH = os.path.dirname(os.path.abspath(__file__)) + +# Input remains in the sample subfolder +INPUT_FILE = os.path.join(BASE_PATH, "poland_data_sample/poland_raw.xlsx") + +# Output is now saved directly in BASE_PATH (the root of the import folder) +# This ensures stat_var_processor.py can find it in the Cloud environment +OUTPUT_FILE = os.path.join(BASE_PATH, "StatisticsPoland_input.csv") -# Target functional age groups TARGET_AGES = [ "0-2", "3-6", "7-12", "13-15", "16-19", "20-24", "25-34", "35-44", "45-54", "55-64", "65 i więcej" ] def process_poland_pivot(): + # Verify input exists if not os.path.exists(INPUT_FILE): - logging.error(f"{INPUT_FILE} not found.") - return + logging.error(f"CRITICAL ERROR: {INPUT_FILE} not found.") + # Tells the automation executor to STOP here + sys.exit(1) - logging.info(f"Starting generic processing. Saving to: {OUTPUT_FILE}") + logging.info(f"Processing data from: {INPUT_FILE}") try: # 1. Load the 'DANE' sheet @@ -35,7 +43,6 @@ def process_poland_pivot(): # 2. Generic Filtering df = df[df['Age'].isin(TARGET_AGES)] - # DYNAMIC YEAR LOGIC current_year = datetime.now().year available_years = sorted([y for y in df['Year'].unique() if y <= current_year]) df = df[df['Year'].isin(available_years)] @@ -51,7 +58,6 @@ def process_poland_pivot(): '65 i więcej': '65 and more' } - # Refactored repetitive replace calls into a loop for col in ['Sex', 'Location', 'Name', 'Age']: df[col] = df[col].replace(translations) @@ -62,21 +68,21 @@ def process_poland_pivot(): values='Value' ) - # 5. Format Geographic Codes (ensuring 7-digit padding) + # 5. Format Geographic Codes pivot_df.index = pivot_df.index.set_levels( pivot_df.index.levels[0].astype(str).str.zfill(7), level=0 ) - # 6. Save result - os.makedirs(OUTPUT_DIR, exist_ok=True) + # 6. Save result directly to BASE_PATH + # encoding='utf-8' is crucial for Polish characters pivot_df.to_csv(OUTPUT_FILE, encoding='utf-8') - logging.info(f"SUCCESS: {OUTPUT_FILE} has been updated.") + logging.info(f"SUCCESS: {OUTPUT_FILE} created in the root directory.") logging.info(f"Years Included: {available_years}") - logging.info(f"Total Geographies Processed: {pivot_df.shape[0]}") except Exception as e: logging.error(f"Processing Error: {e}") + sys.exit(1) if __name__ == "__main__": process_poland_pivot() diff --git a/statvar_imports/statistics_poland/manifest.json b/statvar_imports/statistics_poland/manifest.json index f1f0246b7d..137ce8342d 100644 --- a/statvar_imports/statistics_poland/manifest.json +++ b/statvar_imports/statistics_poland/manifest.json @@ -9,15 +9,15 @@ "provenance_description": "Population data for demographic variables such as population counts, age distributions, and other census-related metrics in Poland", "scripts": [ "download_input_data.py", - "../../tools/statvar_importer/stat_var_processor.py --input_data=poland_input/StatisticsPoland_input.csv --pv_map=StatisticsPoland_pvmap.csv --config_file=Statistics_Poland_metadata.csv --output_path=poland_output/StatisticsPoland_output" + "../../tools/statvar_importer/stat_var_processor.py --input_data=StatisticsPoland_input.csv --pv_map=StatisticsPoland_pvmap.csv --config_file=Statistics_Poland_metadata.csv --output_path=StatisticsPoland_output" ], "source_files": [ - "poland_input/StatisticsPoland_input.csv" + "StatisticsPoland_input.csv" ], "import_inputs": [ { - "template_mcf": "poland_output/StatisticsPoland_output.tmcf", - "cleaned_csv": "poland_output/StatisticsPoland_output.csv" + "template_mcf": "StatisticsPoland_output.tmcf", + "cleaned_csv": "StatisticsPoland_output.csv" } ], "cron_schedule": "0 0 1 1,4,7,10 *"