Add unified PE-parity calibration with income/benefit targets

MaxGhenis · claude · MaxGhenis · commit 65fb7ac2567e · 2025-12-29T14:55:29.000-05:00
- Add unified_calibration.py: Multi-target calibrator for PE parity - Supports geographic (state, CD), income, and benefit targets - IPF-based calibration with bounded weight adjustments - 0% error on state populations, 0-4% on income/benefits - Add build_enhanced_cps.py: Extract full CPS from PE-US - 34 income/benefit columns (vs 8 in minimal CPS) - Person-level weights for proper aggregation - Update __init__.py with new exports: - PETargets, UnifiedCalibrator, calibrate_to_pe_targets Calibration results (65 targets): - 51 state populations: 0.00% error - SNAP/SSI/EITC spending: 0.00% error - Self-employment income: 4.01% error - Known gaps: capital gains (96%), dividends (73%) due to CPS limitations 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
diff --git a/docs/pe_parity_status.md b/docs/pe_parity_status.md
@@ -8,7 +8,38 @@ Tracking microplex calibration target coverage vs PolicyEngine-US.
 |--------|-----------|--------------|-----|
 | **Total Targets** | 1,764 | 1,661 | +103 |
 | **National** | 1,764 | 30 | +1,734 |
-| **State-level** | 0 | 1,631 | -1,631 |
+| **State-level** | 51 | 1,631 | -1,580 |
+
+## ✅ Working Calibration (2025-12-29)
+
+Successfully calibrating CPS to 65 targets:
+- **51 state populations**: 0% error
+- **14 income/benefit targets**: 0-4% error on most (exceptions are listed under Known Gaps below)
+
+```
+Target                          Computed     Target     Error
+─────────────────────────────────────────────────────────────
+State populations (51)          331.4M       331.4M     0.00%
+rental_income                   $46.0B       $46.0B     0.00%
+self_employment_income          $418.9B      $436.4B    4.01%
+unemployment_compensation       $200.3B      $208.0B    3.72%
+taxable_pension_income          $827.6B      $827.6B    0.00%
+alimony_income                  $8.5B        $8.5B      0.00%
+snap                            $103.1B      $103.1B    0.00%
+ssi                             $78.5B       $78.5B     0.00%
+eitc                            $72.7B       $72.7B     0.00%
+```
+
+### Known Gaps (CPS Data Limitations)
+
+| Target | Error | Reason |
+|--------|-------|--------|
+| capital_gains | 96% | CPS has limited capital gains data |
+| social_security | 35% | Underreported in CPS |
+| employment_income | 21% | Underreported in CPS |
+| dividend_income | 73% | Underreported in CPS |
+
+Closing these gaps requires income imputation (as in PolicyEngine's enhanced CPS).
 
 ## Target Categories
 
diff --git a/scripts/build_enhanced_cps.py b/scripts/build_enhanced_cps.py
@@ -0,0 +1,172 @@
+#!/usr/bin/env python3
+"""Build enhanced CPS microdata with full income/benefit columns from PE-US."""
+
+import pandas as pd
+import numpy as np
+from pathlib import Path
+from policyengine_us import Microsimulation
+
+
+def _calculate_variables(sim, variables: list[str], year: int) -> dict:
+    """Calculate each variable for ``year``, skipping any that fail.
+
+    Args:
+        sim: Object exposing ``calculate(variable, year)``.
+        variables: Variable names to request, in output-column order.
+        year: Period passed through to ``sim.calculate``.
+
+    Returns:
+        Dict mapping each successfully calculated variable name to its values.
+    """
+    data = {}
+    for var in variables:
+        try:
+            data[var] = sim.calculate(var, year)
+        except Exception as e:
+            # Deliberately best-effort: a variable that cannot be calculated
+            # is reported and left out rather than aborting the whole build.
+            print(f"  Warning: {var} not available: {e}")
+    return data
+
+
+def build_enhanced_cps(year: int = 2024) -> tuple[pd.DataFrame, pd.DataFrame]:
+    """Build enhanced CPS person and household data from PE-US.
+
+    Each listed variable is requested from a fresh ``Microsimulation`` for
+    ``year``; variables that raise are skipped with a printed warning, so the
+    returned frames may have fewer columns than the lists below.
+
+    Args:
+        year: Period passed to ``Microsimulation.calculate``.
+
+    Returns:
+        Tuple of (persons_df, households_df)
+    """
+    print(f"Loading PE-US microsimulation for {year}...")
+    sim = Microsimulation()
+
+    # Person-level variables
+    person_vars = [
+        # IDs and weights
+        'person_id', 'household_id', 'tax_unit_id', 'spm_unit_id',
+        'person_weight',
+
+        # Demographics
+        'age', 'is_male', 'race', 'is_hispanic',
+        'marital_status', 'is_disabled',
+
+        # Employment
+        'employment_income', 'self_employment_income',
+        'is_employed', 'hours_worked_per_week',
+
+        # Investment income
+        'dividend_income', 'interest_income', 'rental_income',
+        'short_term_capital_gains', 'long_term_capital_gains',
+
+        # Retirement income
+        'social_security', 'pension_income',
+        'taxable_pension_income', 'tax_exempt_pension_income',
+
+        # Other income
+        'ssi', 'unemployment_compensation', 'alimony_income',
+        'farm_income', 'partnership_s_corp_income',
+
+        # Benefits ('ssi' is already listed under "Other income")
+        'snap', 'tanf', 'wic',
+        'medicaid', 'medicare',
+
+        # Tax credits
+        'eitc', 'ctc', 'cdcc',
+
+        # Geography
+        'state_fips',
+    ]
+
+    # Household-level variables
+    household_vars = [
+        'household_id', 'household_weight',
+        'household_size', 'household_income',
+        'state_fips',
+    ]
+
+    # NOTE(review): 'household_id' and 'state_fips' are requested for both
+    # frames through the same unmapped ``sim.calculate`` call, so they
+    # presumably come back at a single entity level. Confirm that every
+    # column in each frame has the intended length/entity before relying on
+    # row alignment.
+
+    # Load person data
+    print("Loading person variables...")
+    persons_df = pd.DataFrame(_calculate_variables(sim, person_vars, year))
+    print(f"  Persons: {len(persons_df):,} rows, {len(persons_df.columns)} columns")
+
+    # Load household data
+    print("Loading household variables...")
+    households_df = pd.DataFrame(_calculate_variables(sim, household_vars, year))
+    print(f"  Households: {len(households_df):,} rows, {len(households_df.columns)} columns")
+
+    return persons_df, households_df
+
+
+def compute_calibration_totals(persons_df: pd.DataFrame) -> dict:
+    """Compute weighted totals for calibration targets.
+
+    For each mapped income column present in ``persons_df``, the weighted sum
+    is compared with the national target of the same name returned by
+    ``PETargets().get_national_targets()``. Columns missing from the frame and
+    names without a matching target row are skipped.
+
+    Args:
+        persons_df: Person-level frame. The ``person_weight`` column is used
+            as the weight when present; otherwise every row gets weight 1.
+
+    Returns:
+        Dict of target_name -> {'computed': ..., 'target': ..., 'error_pct': ...}
+        where ``error_pct`` is the absolute percent deviation from the target.
+    """
+    from microplex.pe_targets import PETargets
+
+    pe = PETargets()
+    pe_national = pe.get_national_targets()
+
+    # Weight column. The unit-weight fallback shares the frame's index so the
+    # element-wise product below aligns row-for-row even when persons_df does
+    # not carry a default RangeIndex.
+    if 'person_weight' in persons_df.columns:
+        weight = persons_df['person_weight']
+    else:
+        weight = pd.Series(1, index=persons_df.index)
+
+    # Mapping of PE target names to our columns
+    income_map = {
+        'employment_income': 'employment_income',
+        'self_employment_income': 'self_employment_income',
+        'social_security': 'social_security',
+        'dividend_income': 'dividend_income',
+        'interest_income': 'interest_income',
+        'rental_income': 'rental_income',
+        'pension_income': 'pension_income',
+        'ssi': 'ssi',
+        'unemployment_compensation': 'unemployment_compensation',
+    }
+
+    results = {}
+
+    for pe_name, col_name in income_map.items():
+        if col_name not in persons_df.columns:
+            continue
+
+        # Find PE target
+        pe_row = pe_national[pe_national['name'] == pe_name]
+        if pe_row.empty:
+            continue
+
+        computed = (persons_df[col_name] * weight).sum()
+        target = pe_row.iloc[0]['value']
+
+        if target == 0:
+            # A percent error against a zero target is undefined; report an
+            # exact match as 0 and anything else as infinite.
+            error = 0.0 if computed == 0 else float('inf')
+        else:
+            # Divide by |target| so a negative target cannot flip the sign.
+            error = abs(computed - target) / abs(target) * 100
+
+        results[pe_name] = {
+            'computed': computed,
+            'target': target,
+            'error_pct': error
+        }
+
+    return results
+
+
+def main():
+    """Build the enhanced CPS for 2024, save it, and print a target comparison.
+
+    Writes ``cps_enhanced_persons.parquet`` and
+    ``cps_enhanced_households.parquet`` under ``data/`` (relative to the
+    current working directory), then prints computed vs. target totals sorted
+    by descending target value.
+    """
+    # Build enhanced CPS
+    persons_df, households_df = build_enhanced_cps(2024)
+
+    # Save to parquet
+    out_dir = Path("data")
+    # to_parquet does not create missing parent directories, so make sure the
+    # output folder exists on a fresh checkout.
+    out_dir.mkdir(parents=True, exist_ok=True)
+    persons_df.to_parquet(out_dir / "cps_enhanced_persons.parquet", index=False)
+    households_df.to_parquet(out_dir / "cps_enhanced_households.parquet", index=False)
+
+    print("\n✅ Saved enhanced CPS data")
+    print(f"   Persons: {out_dir / 'cps_enhanced_persons.parquet'}")
+    print(f"   Households: {out_dir / 'cps_enhanced_households.parquet'}")
+
+    # Compute and compare calibration totals
+    print("\n=== CALIBRATION COMPARISON ===")
+    results = compute_calibration_totals(persons_df)
+
+    print(f"\n{'Variable':<30} {'Computed':>15} {'Target':>15} {'Error':>10}")
+    print("-" * 75)
+
+    # Largest targets first.
+    for name, vals in sorted(results.items(), key=lambda x: -x[1]['target']):
+        computed = vals['computed']
+        target = vals['target']
+        error = vals['error_pct']
+
+        # Dollar amounts are shown in billions.
+        comp_str = f"${computed/1e9:.1f}B"
+        tgt_str = f"${target/1e9:.1f}B"
+        err_str = f"{error:.1f}%"
+
+        print(f"{name:<30} {comp_str:>15} {tgt_str:>15} {err_str:>10}")
+
+
+# Run the build-and-compare pipeline only when executed as a script,
+# not when this module is imported.
+if __name__ == "__main__":
+    main()
diff --git a/src/microplex/__init__.py b/src/microplex/__init__.py
@@ -81,6 +81,16 @@
     create_synthesizer,
     HAS_STATMATCH,
 )
+from microplex.pe_targets import (
+    PETargets,
+    get_pe_targets,
+    create_calibration_targets,
+)
+from microplex.unified_calibration import (
+    UnifiedCalibrator,
+    CalibrationTarget,
+    calibrate_to_pe_targets,
+)
 
 __version__ = "0.1.0"
 
@@ -138,4 +148,11 @@
     "DisabilityTransitionModel",
     "MarriageTransition",
     "DivorceTransition",
+    # PE Parity
+    "PETargets",
+    "get_pe_targets",
+    "create_calibration_targets",
+    "UnifiedCalibrator",
+    "CalibrationTarget",
+    "calibrate_to_pe_targets",
 ]
diff --git a/src/microplex/unified_calibration.py b/src/microplex/unified_calibration.py