diff --git a/example/001_create_models.py b/example/001_create_models.py index 314dbbe..6192d4b 100644 --- a/example/001_create_models.py +++ b/example/001_create_models.py @@ -1,11 +1,12 @@ -from coldsnap import Data, Model +import os +import pandas as pd from sklearn import datasets from sklearn.ensemble import RandomForestClassifier from sklearn.svm import SVC from sklearn.tree import DecisionTreeClassifier -import pandas as pd -import os + +from coldsnap import Data, Model iris = datasets.load_iris(as_frame=True) iris_df = pd.merge(iris.data, iris.target, how="inner", left_index=True, right_index=True) diff --git a/example/002_load_model.py b/example/002_load_model.py index dc33e32..fe10984 100644 --- a/example/002_load_model.py +++ b/example/002_load_model.py @@ -1,6 +1,7 @@ -from coldsnap import Model import matplotlib.pyplot as plt +from coldsnap import Model + if __name__ == "__main__": try: cs_model = Model.from_pickle("./tmp/iris_model.pkl.gz") diff --git a/pyproject.toml b/pyproject.toml index ad85029..a9041d7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "ColdSnap" -version = "0.1.0" +version = "0.2.0" description = "Create snapshots of machine learning models and their training data" readme = "README.md" requires-python = ">=3.10" diff --git a/setup.py b/setup.py index 1fe7061..9a69b46 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,6 @@ from setuptools import find_packages, setup -with open("README.md", "r", encoding="utf-8") as fh: +with open("README.md", encoding="utf-8") as fh: long_description = fh.read() setup( diff --git a/src/coldsnap/model.py b/src/coldsnap/model.py index 995dc00..01cb184 100644 --- a/src/coldsnap/model.py +++ b/src/coldsnap/model.py @@ -7,6 +7,7 @@ import hashlib import pickle +import warnings from typing import Literal, Optional from sklearn.base import BaseEstimator, is_classifier, is_regressor @@ -221,6 +222,11 @@ def fit(self) -> None: def 
predict(self, data): """Generate predictions using the fitted estimator. + Validates column order when DataFrame input is provided. If columns are present + but in incorrect order, they will be automatically reordered to match training + data with a warning. If input has no column names (e.g., numpy array), a warning + is issued about assuming the order matches training. + Args: data: Input features (array-like or pandas DataFrame). @@ -228,7 +234,8 @@ def predict(self, data): Predicted values (classification labels or regression targets). Raises: - ValueError: If no estimator is provided. + ValueError: If no estimator is provided, or if DataFrame is missing required + features. TypeError: If called on a transformer (use transform() instead). Examples: @@ -241,6 +248,45 @@ def predict(self, data): if estimator_type == "transformer": raise TypeError("Cannot call predict() on a transformer. Use transform() instead.") + # Validate and reorder columns if necessary + import pandas as pd + + if isinstance(data, pd.DataFrame): + expected_features = self.features + + if expected_features is not None: + # Check if all expected features are present + data_columns = list(data.columns) + missing_features = set(expected_features) - set(data_columns) + + if missing_features: + raise ValueError( + f"Input DataFrame is missing required features: {sorted(missing_features)}. " + f"Expected features: {expected_features}" + ) + + # Compare the full column list with the expected features: a different + # order or any extra column takes this branch (extras are dropped below) + if data_columns != expected_features: + # Reorder columns to match expected order (and drop extra columns) + data = data[expected_features] + warnings.warn( + f"Input DataFrame columns have been reordered to match the expected " + f"feature order: {expected_features}. 
Original order was: {data_columns}", + UserWarning, + stacklevel=2, + ) + else: + # Input is array-like (e.g., numpy array) without column names + expected_features = self.features + if expected_features is not None: + warnings.warn( + "Input data has no column names. Assuming features are in the expected " + f"order: {expected_features}", + UserWarning, + stacklevel=2, + ) + return self._clf.predict(data) def transform(self, data): diff --git a/tests/test_data.py b/tests/test_data.py index 992344f..0e6467c 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -1,6 +1,7 @@ -import pytest import pandas as pd +import pytest from sklearn.model_selection import train_test_split + from coldsnap.data import Data diff --git a/tests/test_model.py b/tests/test_model.py index 0b59be9..3017039 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -1,5 +1,6 @@ import pytest from sklearn.ensemble import RandomForestClassifier + from coldsnap import Data, Model @@ -602,8 +603,8 @@ def test_estimator_property_works(sample_dataframe): def test_get_estimator_type(sample_dataframe): """Test that _get_estimator_type correctly identifies estimator types.""" - from sklearn.preprocessing import StandardScaler from sklearn.linear_model import LinearRegression + from sklearn.preprocessing import StandardScaler data_instance = Data.from_df(sample_dataframe, "label", test_size=0.2, random_state=42) @@ -764,3 +765,140 @@ def test_features_property_consistency(sample_dataframe): # Both should have the same content (though prioritization differs) assert set(features_before) == set(features_after) + + +# Tests for column order validation in predict() +def test_predict_with_correct_column_order(model_instance): + """Test that predict works normally when columns are in correct order.""" + model_instance.fit() + X_test = model_instance.data.X_test + + # Predict should work without warnings when columns are in correct order + predictions = model_instance.predict(X_test) + + assert 
len(predictions) == len(X_test) + assert set(predictions).issubset(set(model_instance.data.y_train.unique())) + + +def test_predict_with_incorrect_column_order(model_instance): + """Test that predict reorders columns and warns when order is incorrect.""" + + model_instance.fit() + X_test = model_instance.data.X_test + + # Reorder columns (reverse order) + X_test_reordered = X_test[list(reversed(X_test.columns))] + + # Predict should work but issue a warning + with pytest.warns(UserWarning, match="columns have been reordered"): + predictions = model_instance.predict(X_test_reordered) + + # Predictions should still be correct + assert len(predictions) == len(X_test) + assert set(predictions).issubset(set(model_instance.data.y_train.unique())) + + +def test_predict_with_missing_columns(model_instance): + """Test that predict raises error when required columns are missing.""" + + model_instance.fit() + X_test = model_instance.data.X_test + + # Remove one column + X_test_missing = X_test.drop(columns=[X_test.columns[0]]) + + # Predict should raise ValueError + with pytest.raises(ValueError, match="missing required features"): + model_instance.predict(X_test_missing) + + +def test_predict_with_extra_columns(model_instance): + """Test that predict works with extra columns and reorders appropriately.""" + + model_instance.fit() + X_test = model_instance.data.X_test + + # Add an extra column + X_test_extra = X_test.copy() + X_test_extra["extra_feature"] = 999 + + # Predict should work and warn about reordering (due to extra column) + with pytest.warns(UserWarning, match="columns have been reordered"): + predictions = model_instance.predict(X_test_extra) + + # Predictions should still be correct + assert len(predictions) == len(X_test) + assert set(predictions).issubset(set(model_instance.data.y_train.unique())) + + +def test_predict_with_numpy_array_warns(model_instance): + """Test that predict warns when input is numpy array without column names.""" + + 
model_instance.fit() + X_test = model_instance.data.X_test + + # Convert to numpy array (no column names) + X_test_array = X_test.values + + # Predict should work but issue a warning about assuming order + with pytest.warns( + UserWarning, match="no column names.*Assuming features are in the expected order" + ): + predictions = model_instance.predict(X_test_array) + + # Predictions should still work + assert len(predictions) == len(X_test) + + +def test_predict_column_order_with_regressor(sample_dataframe): + """Test that column order validation works for regressors too.""" + from sklearn.linear_model import LinearRegression + + # Create regression dataset + df = sample_dataframe.copy() + df["target"] = df["feature1"] * 2 + df["feature2"] * 3 + df = df.drop(columns=["label"]) + + data_instance = Data.from_df(df, "target", test_size=0.2, random_state=42) + regressor = LinearRegression() + model = Model(data=data_instance, estimator=regressor) + model.fit() + + X_test = model.data.X_test + + # Test with reordered columns + X_test_reordered = X_test[list(reversed(X_test.columns))] + + with pytest.warns(UserWarning, match="columns have been reordered"): + predictions = model.predict(X_test_reordered) + + assert len(predictions) == len(X_test) + + +def test_predict_column_order_no_features_available(): + """Test that predict validates column order from estimator features without a Data object.""" + import pandas as pd + from sklearn.ensemble import RandomForestClassifier + + # Create model without Data object (features come from the fitted estimator) + clf = RandomForestClassifier(random_state=42) + model = Model(estimator=clf) + + # Manually fit on some data + X_train = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + y_train = pd.Series([0, 1, 0]) + model.clf.fit(X_train, y_train) + + # Now model has feature_names_in_ + assert model.features == ["a", "b"] + + # Test with correct order - should work + X_test = pd.DataFrame({"a": [1, 2], "b": [3, 4]}) + predictions = model.predict(X_test) + assert 
len(predictions) == 2 + + # Test with incorrect order - should warn and reorder + X_test_wrong = pd.DataFrame({"b": [3, 4], "a": [1, 2]}) + with pytest.warns(UserWarning, match="columns have been reordered"): + predictions = model.predict(X_test_wrong) + assert len(predictions) == 2 diff --git a/tests/test_model_mixins.py b/tests/test_model_mixins.py index edb9d52..081ca2c 100644 --- a/tests/test_model_mixins.py +++ b/tests/test_model_mixins.py @@ -1,8 +1,9 @@ +import matplotlib import pytest from sklearn.ensemble import RandomForestClassifier +from sklearn.metrics import ConfusionMatrixDisplay, RocCurveDisplay + from coldsnap import Data, Model -from sklearn.metrics import RocCurveDisplay, ConfusionMatrixDisplay -import matplotlib matplotlib.use("Agg") # Include this line to suppress windows popping up during tests diff --git a/tests/test_regression_evaluation.py b/tests/test_regression_evaluation.py index 5b8825c..06f377d 100644 --- a/tests/test_regression_evaluation.py +++ b/tests/test_regression_evaluation.py @@ -1,7 +1,8 @@ -import pytest import numpy as np -from sklearn.linear_model import LinearRegression, Ridge, Lasso +import pytest from sklearn.ensemble import RandomForestRegressor +from sklearn.linear_model import Lasso, LinearRegression, Ridge + from coldsnap import Data, Model diff --git a/tests/test_serializable.py b/tests/test_serializable.py index e16055a..6f6414c 100644 --- a/tests/test_serializable.py +++ b/tests/test_serializable.py @@ -1,6 +1,8 @@ -import pytest -import pickle import gzip +import pickle + +import pytest + from coldsnap.serializable import ( Serializable, ) # Replace `your_module` with the actual module name diff --git a/tests/test_utils.py b/tests/test_utils.py index 44d4407..f02ecbb 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,6 +1,7 @@ -from coldsnap.utils import create_overview import pandas as pd +from coldsnap.utils import create_overview + def test_create_overview(): # Define the test data path