diff --git a/src/microplex_us/variables.py b/src/microplex_us/variables.py index 2385e28..b620f28 100644 --- a/src/microplex_us/variables.py +++ b/src/microplex_us/variables.py @@ -642,6 +642,16 @@ def _nonnegative_series(frame: pd.DataFrame, column: str) -> pd.Series: ) +# Share of a dividend total that is qualified when no observed qualified/ +# non-qualified breakdown is available (e.g. CPS DIV_VAL, which reports only a +# total). Basis: SOI 2015 PUF E00650/E00600 = $204.0B/$260.9B ~= 0.782 qualified, rounded to 0.78 below. +# Splitting an unsplit total by this share avoids zeroing +# qualified_dividend_income on every CPS-native dividend row (which previously +# dumped 100% into non-qualified and inverted the national qualified vs +# non-qualified split relative to the SOI targets). +UNSPLIT_DIVIDEND_QUALIFIED_SHARE = 0.78 + + def normalize_dividend_columns(frame: pd.DataFrame) -> pd.DataFrame: """Normalize dividends onto an atomic basis, then derive totals.""" result = frame.copy() @@ -660,7 +670,14 @@ def normalize_dividend_columns(frame: pd.DataFrame) -> pd.DataFrame: if has_qualified and has_non_qualified: component_total = qualified + non_qualified total_only = component_total.eq(0.0) & total.gt(0.0) - non_qualified = non_qualified.where(~total_only, total) + # Allocate an unsplit total by the SOI qualified share rather than + # defaulting the whole amount to non-qualified. 
+ qualified = qualified.where( + ~total_only, total * UNSPLIT_DIVIDEND_QUALIFIED_SHARE + ) + non_qualified = non_qualified.where( + ~total_only, total * (1.0 - UNSPLIT_DIVIDEND_QUALIFIED_SHARE) + ) component_total = qualified + non_qualified normalized_total = component_total.where(component_total.ne(0.0), total) elif has_qualified: @@ -686,8 +703,8 @@ def normalize_dividend_columns(frame: pd.DataFrame) -> pd.DataFrame: normalized_total = pd.Series(normalized_total, index=result.index, dtype=float) else: normalized_total = total.astype(float) - non_qualified = normalized_total.copy() - qualified = pd.Series(0.0, index=result.index, dtype=float) + qualified = normalized_total * UNSPLIT_DIVIDEND_QUALIFIED_SHARE + non_qualified = normalized_total * (1.0 - UNSPLIT_DIVIDEND_QUALIFIED_SHARE) result["qualified_dividend_income"] = qualified.astype(float) result["non_qualified_dividend_income"] = non_qualified.astype(float) diff --git a/tests/test_variables.py b/tests/test_variables.py index d39eb02..140d002 100644 --- a/tests/test_variables.py +++ b/tests/test_variables.py @@ -3,6 +3,7 @@ from __future__ import annotations import pandas as pd +import pytest from microplex.core import EntityType from microplex_us.variables import ( @@ -57,12 +58,41 @@ def test_normalize_dividend_columns_coalesces_sparse_total_aliases_by_row(): normalized = normalize_dividend_columns(frame) - assert normalized["qualified_dividend_income"].tolist() == [0.0, 5.0, 0.0] - assert normalized["non_qualified_dividend_income"].tolist() == [80.0, 25.0, 0.0] + # Row 0 carries only a dividend total (80) with no observed split, so it is + # allocated by the SOI qualified share instead of defaulting 100% to + # non-qualified. Rows 1-2 keep their observed components unchanged. 
+ assert normalized["qualified_dividend_income"].tolist() == pytest.approx( + [62.4, 5.0, 0.0] + ) + assert normalized["non_qualified_dividend_income"].tolist() == pytest.approx( + [17.6, 25.0, 0.0] + ) assert normalized["ordinary_dividend_income"].tolist() == [80.0, 30.0, 0.0] assert normalized["dividend_income"].tolist() == [80.0, 30.0, 0.0] +def test_normalize_dividend_columns_splits_unsplit_total_by_qualified_share(): + # A row with only a dividend total (e.g. CPS DIV_VAL) and no qualified / + # non-qualified components must be split by the SOI qualified share, not + # left entirely non-qualified (which zeroed qualified dividends nationally + # and inverted the split vs the SOI targets). + frame = pd.DataFrame( + { + "qualified_dividend_income": [0.0], + "non_qualified_dividend_income": [0.0], + "dividend_income": [1_000.0], + } + ) + + normalized = normalize_dividend_columns(frame) + + assert normalized["qualified_dividend_income"].tolist() == pytest.approx([780.0]) + assert normalized["non_qualified_dividend_income"].tolist() == pytest.approx( + [220.0] + ) + assert normalized["dividend_income"].tolist() == pytest.approx([1_000.0]) + + def test_normalize_social_security_columns_tracks_unclassified_residual(): frame = pd.DataFrame( {