Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 2 additions & 3 deletions statvar_imports/statistics_poland/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,10 +38,9 @@ python3 tools/statvar_importer/stat_var_processor.py \
**For Main data run**
```bash
python3 tools/statvar_importer/stat_var_processor.py \
--input_data='statvar_imports/statistics_poland/poland_input/StatisticsPoland_input.csv' \
--input_data='statvar_imports/statistics_poland/StatisticsPoland_input.csv' \
--pv_map='statvar_imports/statistics_poland/StatisticsPoland_pvmap.csv' \
--output_path='statvar_imports/statistics_poland/poland_output/StatisticsPoland_output' \
--output_path='statvar_imports/statistics_poland/StatisticsPoland_output' \
--config_file='statvar_imports/statistics_poland/Statistics_Poland_metadata.csv' \
--existing_statvar_mcf=gs://unresolved_mcf/scripts/statvar/stat_vars.mcf
```

38 changes: 22 additions & 16 deletions statvar_imports/statistics_poland/download_input_data.py
Original file line number Diff line number Diff line change
@@ -1,31 +1,39 @@
import pandas as pd
import os
import logging
import sys
from datetime import datetime

# Configure logging
logging.basicConfig(
level=logging.INFO,
level=logging.INFO,
format='%(levelname)s: %(message)s'
)

# Configuration
INPUT_FILE = "statvar_imports/statistics_poland/poland_data_sample/poland_raw.xlsx"
OUTPUT_DIR = "statvar_imports/statistics_poland/poland_input"
OUTPUT_FILE = os.path.join(OUTPUT_DIR, "StatisticsPoland_input.csv")
# --- FLATTENED PATH LOGIC ---
# Every path is anchored to the directory that contains this script, so the
# result does not depend on the caller's current working directory.
BASE_PATH = os.path.dirname(os.path.abspath(__file__))

# The raw workbook stays in the sample subfolder. Components are passed
# separately so os.path.join supplies the platform's path separator.
INPUT_FILE = os.path.join(BASE_PATH, "poland_data_sample", "poland_raw.xlsx")

# The generated CSV is written directly into BASE_PATH (no subfolder).
OUTPUT_FILE = os.path.join(BASE_PATH, "StatisticsPoland_input.csv")

# Age brackets kept from the source data; rows with any other 'Age' value are
# filtered out. The final label is Polish and is later translated to
# "65 and more".
TARGET_AGES = [
    "0-2",
    "3-6",
    "7-12",
    "13-15",
    "16-19",
    "20-24",
    "25-34",
    "35-44",
    "45-54",
    "55-64",
    "65 i więcej",
]

def process_poland_pivot():
# Verify input exists
if not os.path.exists(INPUT_FILE):
logging.error(f"{INPUT_FILE} not found.")
return
logging.error(f"CRITICAL ERROR: {INPUT_FILE} not found.")
# Tells the automation executor to STOP here
sys.exit(1)

logging.info(f"Starting generic processing. Saving to: {OUTPUT_FILE}")
logging.info(f"Processing data from: {INPUT_FILE}")

try:
# 1. Load the 'DANE' sheet
Expand All @@ -35,7 +43,6 @@ def process_poland_pivot():
# 2. Generic Filtering
df = df[df['Age'].isin(TARGET_AGES)]

# DYNAMIC YEAR LOGIC
current_year = datetime.now().year
available_years = sorted([y for y in df['Year'].unique() if y <= current_year])
df = df[df['Year'].isin(available_years)]
Expand All @@ -51,7 +58,6 @@ def process_poland_pivot():
'65 i więcej': '65 and more'
}

# Refactored repetitive replace calls into a loop
for col in ['Sex', 'Location', 'Name', 'Age']:
df[col] = df[col].replace(translations)

Expand All @@ -62,21 +68,21 @@ def process_poland_pivot():
values='Value'
)

# 5. Format Geographic Codes (ensuring 7-digit padding)
# 5. Format Geographic Codes
pivot_df.index = pivot_df.index.set_levels(
pivot_df.index.levels[0].astype(str).str.zfill(7), level=0
)

# 6. Save result
os.makedirs(OUTPUT_DIR, exist_ok=True)
# 6. Save result directly to BASE_PATH
# encoding='utf-8' is crucial for Polish characters
pivot_df.to_csv(OUTPUT_FILE, encoding='utf-8')

logging.info(f"SUCCESS: {OUTPUT_FILE} has been updated.")
logging.info(f"SUCCESS: {OUTPUT_FILE} created in the root directory.")
logging.info(f"Years Included: {available_years}")
logging.info(f"Total Geographies Processed: {pivot_df.shape[0]}")

except Exception as e:
logging.error(f"Processing Error: {e}")
sys.exit(1)

if __name__ == "__main__":
process_poland_pivot()
8 changes: 4 additions & 4 deletions statvar_imports/statistics_poland/manifest.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,15 @@
"provenance_description": "Population data for demographic variables such as population counts, age distributions, and other census-related metrics in Poland",
"scripts": [
"download_input_data.py",
"../../tools/statvar_importer/stat_var_processor.py --input_data=poland_input/StatisticsPoland_input.csv --pv_map=StatisticsPoland_pvmap.csv --config_file=Statistics_Poland_metadata.csv --output_path=poland_output/StatisticsPoland_output"
"../../tools/statvar_importer/stat_var_processor.py --input_data=StatisticsPoland_input.csv --pv_map=StatisticsPoland_pvmap.csv --config_file=Statistics_Poland_metadata.csv --output_path=StatisticsPoland_output"
],
"source_files": [
"poland_input/StatisticsPoland_input.csv"
"StatisticsPoland_input.csv"
],
"import_inputs": [
{
"template_mcf": "poland_output/StatisticsPoland_output.tmcf",
"cleaned_csv": "poland_output/StatisticsPoland_output.csv"
"template_mcf": "StatisticsPoland_output.tmcf",
"cleaned_csv": "StatisticsPoland_output.csv"
}
],
"cron_schedule": "0 0 1 1,4,7,10 *"
Expand Down
Loading