Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions changelog_entry.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
- bump: minor
  changes:
    fixed:
    - QRF hyperparameter tuning now correctly tunes QRF and RFC separately.
    - QRF imputation now correctly handles categorical variables that become predictors after imputation.
2 changes: 1 addition & 1 deletion microimpute/comparisons/autoimpute.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
)
from microimpute.models import OLS, QRF, Imputer, QuantReg
from microimpute.utils.data import unnormalize_predictions
from microimpute.utils.type_detector import VariableTypeDetector
from microimpute.utils.type_handling import VariableTypeDetector

try:
from microimpute.models import Matching
Expand Down
2 changes: 1 addition & 1 deletion microimpute/comparisons/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
validate_quantiles,
)
from microimpute.config import QUANTILES, VALIDATE_CONFIG
from microimpute.utils.type_detector import VariableTypeDetector
from microimpute.utils.type_handling import VariableTypeDetector

log = logging.getLogger(__name__)

Expand Down
143 changes: 4 additions & 139 deletions microimpute/models/imputer.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,10 @@
from pydantic import SkipValidation, validate_call

from microimpute.config import RANDOM_STATE, VALIDATE_CONFIG
from microimpute.utils.type_detector import VariableTypeDetector
from microimpute.utils.type_handling import (
DummyVariableProcessor,
VariableTypeDetector,
)


class _ConstantValueModel:
Expand All @@ -34,144 +37,6 @@ def predict(self, X: pd.DataFrame, **kwargs) -> pd.Series:
)


class DummyVariableProcessor:
    """Handles conversion of categorical predictors to dummy variables.

    The processor remembers, per original categorical predictor, the exact
    dummy columns produced by ``preprocess_predictors`` so that
    ``apply_dummy_encoding_to_test`` can reproduce the same column layout
    on other data.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, logger: logging.Logger):
        """Store the logger and start with an empty dummy mapping.

        Args:
            logger: Logger used for progress messages and passed on to the
                variable type detector.
        """
        self.logger = logger
        self.dummy_mapping = {}  # Maps original column to dummy columns

    def preprocess_predictors(
        self,
        data: pd.DataFrame,
        predictors: List[str],
        imputed_variables: List[str],
    ) -> Tuple[pd.DataFrame, List[str]]:
        """
        Process only predictor variables, converting categoricals to dummies.
        Imputation targets remain in original form.

        Args:
            data: Frame holding at least all predictor and target columns.
            predictors: Names of the predictor columns.
            imputed_variables: Names of the imputation target columns.

        Returns:
            Tuple of (processed_data, updated_predictors). ``processed_data``
            is a copy restricted to the needed columns in which categorical
            predictors are replaced by float64 dummy columns and boolean
            predictors are cast to float64. ``updated_predictors`` is the
            predictor list with each categorical name replaced by its dummy
            column names (appended at the end).

        Raises:
            KeyError: If a predictor or target column is missing from
                ``data``.
        """
        # De-duplicate while keeping a stable, reproducible column order
        # (a plain set() would order string columns differently per run).
        all_columns = list(dict.fromkeys(predictors + imputed_variables))
        data = data[all_columns].copy()
        detector = VariableTypeDetector()

        # Classify every predictor once (targets are never inspected here).
        categorical_predictors = []
        boolean_predictors = []
        for col in predictors:
            var_type, _ = detector.categorize_variable(
                data[col], col, self.logger
            )
            if var_type in ["categorical", "numeric_categorical"]:
                categorical_predictors.append(col)
                self.logger.info(
                    f"Will create dummy variables for predictor '{col}' ({var_type})"
                )
            elif var_type == "bool":
                boolean_predictors.append(col)

        updated_predictors = predictors.copy()

        # Encode each categorical predictor on its own so the mapping holds
        # exactly the columns produced for it. Matching dummy names by the
        # "<col>_" prefix would wrongly attribute the dummies of a column
        # named "a_b" to a column named "a" as well.
        dummy_frames = []
        for orig_col in categorical_predictors:
            dummy_df = pd.get_dummies(
                data[[orig_col]],
                columns=[orig_col],
                dtype="float64",
                drop_first=True,  # Standard practice for predictors
            )
            dummy_cols = list(dummy_df.columns)
            self.dummy_mapping[orig_col] = dummy_cols
            dummy_frames.append(dummy_df)

            # Update predictor list
            updated_predictors.remove(orig_col)
            updated_predictors.extend(dummy_cols)

            self.logger.debug(
                f"Created {len(dummy_cols)} dummy variables for '{orig_col}'"
            )

        if dummy_frames:
            # Drop original categorical columns and add dummies
            data = data.drop(columns=categorical_predictors)
            data = pd.concat([data, *dummy_frames], axis=1)

        # Convert boolean predictors to float (but keep as single column)
        for col in boolean_predictors:
            data[col] = data[col].astype("float64")
            self.logger.debug(
                f"Converted boolean predictor '{col}' to float64"
            )

        return data, updated_predictors

    def apply_dummy_encoding_to_test(
        self,
        data: pd.DataFrame,
        predictors: List[str],
    ) -> Tuple[pd.DataFrame, List[str]]:
        """Apply same dummy encoding to test data based on training mapping.

        Args:
            data: Frame to encode; it is copied, not modified in place.
            predictors: Names of the predictor columns in their original
                (pre-encoding) form.

        Returns:
            Tuple of (processed_data, updated_predictors) where every
            predictor present in ``self.dummy_mapping`` and in ``data`` has
            been replaced by exactly the dummy columns recorded for it, and
            boolean predictors have been cast to float64.
        """
        detector = VariableTypeDetector()
        data = data.copy()
        updated_predictors = predictors.copy()

        # Apply dummy encoding based on stored mapping
        for orig_col, dummy_cols in self.dummy_mapping.items():
            if orig_col not in predictors or orig_col not in data.columns:
                continue

            # Encode all categories present here; the recorded columns are
            # selected below, so nothing is dropped at this stage.
            dummy_df = pd.get_dummies(
                data[[orig_col]],
                columns=[orig_col],
                dtype="float64",
                drop_first=False,
            )

            # Keep exactly the recorded dummy columns, in recorded order.
            # Categories absent from this data become all-zero columns;
            # categories without a recorded dummy column are discarded.
            dummy_df = dummy_df.reindex(columns=dummy_cols, fill_value=0.0)

            # Update data
            data = data.drop(columns=[orig_col])
            data = pd.concat([data, dummy_df], axis=1)

            # Update predictor list
            updated_predictors.remove(orig_col)
            updated_predictors.extend(dummy_cols)

        # Convert boolean predictors to float
        for col in predictors:
            if col in data.columns:
                var_type, _ = detector.categorize_variable(
                    data[col], col, self.logger
                )
                if var_type == "bool":
                    data[col] = data[col].astype("float64")

        return data, updated_predictors

    # Note: Old reverse encoding methods removed as we now handle categorical
    # targets directly through classification models


class Imputer(ABC):
"""
Abstract base class for fitting imputation models.
Expand Down
Loading