raeslab · sepro · Jan 8, 2026 · Jan 7, 2026 · Jan 7, 2026
diff --git a/example/001_create_models.py b/example/001_create_models.py
@@ -1,20 +1,21 @@
-from coldsnap import Data, Model
+import os
 
+import pandas as pd
 from sklearn import datasets
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.svm import SVC
 from sklearn.tree import DecisionTreeClassifier
-import pandas as pd
-import os
+
+from coldsnap import Data, Model
 
 iris = datasets.load_iris(as_frame=True)
 iris_df = pd.merge(iris.data, iris.target, how="inner", left_index=True, right_index=True)

 if __name__ == "__main__":
    try:
        os.mkdir("./tmp/")
    except FileExistsError:
        pass

    cs_data = Data.from_df(
        iris_df,

diff --git a/example/002_load_model.py b/example/002_load_model.py
@@ -1,6 +1,7 @@
-from coldsnap import Model
 import matplotlib.pyplot as plt
 
+from coldsnap import Model
+
 if __name__ == "__main__":
     try:
         cs_model = Model.from_pickle("./tmp/iris_model.pkl.gz")

diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "ColdSnap"
-version = "0.1.0"
+version = "0.2.0"
 description = "Create snapshots of machine learning models and their training data"
 readme = "README.md"
 requires-python = ">=3.10"

diff --git a/setup.py b/setup.py
@@ -1,6 +1,6 @@
 from setuptools import find_packages, setup
 
-with open("README.md", "r", encoding="utf-8") as fh:
+with open("README.md", encoding="utf-8") as fh:
     long_description = fh.read()
 
 setup(

diff --git a/src/coldsnap/model.py b/src/coldsnap/model.py
@@ -7,6 +7,7 @@
 
 import hashlib
 import pickle
+import warnings
 from typing import Literal, Optional
 
 from sklearn.base import BaseEstimator, is_classifier, is_regressor
@@ -221,14 +222,20 @@ def fit(self) -> None:
     def predict(self, data):
         """Generate predictions using the fitted estimator.
 
+        Validates column order when DataFrame input is provided. If columns are present
+        but in incorrect order, they will be automatically reordered to match training
+        data with a warning. If input has no column names (e.g., numpy array), a warning
+        is issued about assuming the order matches training.
+
         Args:
             data: Input features (array-like or pandas DataFrame).
 
         Returns:
             Predicted values (classification labels or regression targets).
 
         Raises:
-            ValueError: If no estimator is provided.
+            ValueError: If no estimator is provided, or if DataFrame is missing required
+                features.
             TypeError: If called on a transformer (use transform() instead).
 
         Examples:
@@ -241,6 +248,45 @@ def predict(self, data):
         if estimator_type == "transformer":
             raise TypeError("Cannot call predict() on a transformer. Use transform() instead.")
 
+        # Validate and reorder columns if necessary
+        import pandas as pd
+
+        if isinstance(data, pd.DataFrame):
+            expected_features = self.features
+
+            if expected_features is not None:
+                # Check if all expected features are present
+                data_columns = list(data.columns)
+                missing_features = set(expected_features) - set(data_columns)
+
+                if missing_features:
+                    raise ValueError(
+                        f"Input DataFrame is missing required features: {sorted(missing_features)}. "
+                        f"Expected features: {expected_features}"
+                    )
+
+                # Check if columns are in the correct order
+                # Only consider the features the model expects (ignore extra columns)
+                if data_columns != expected_features:
+                    # Reorder columns to match expected order (and drop extra columns)
+                    data = data[expected_features]
+                    warnings.warn(
+                        f"Input DataFrame columns have been reordered to match the expected "
+                        f"feature order: {expected_features}. Original order was: {data_columns}",
+                        UserWarning,
+                        stacklevel=2,
+                    )
+        else:
+            # Input is array-like (e.g., numpy array) without column names
+            expected_features = self.features
+            if expected_features is not None:
+                warnings.warn(
+                    "Input data has no column names. Assuming features are in the expected "
+                    f"order: {expected_features}",
+                    UserWarning,
+                    stacklevel=2,
+                )
+
         return self._clf.predict(data)
 
     def transform(self, data):

diff --git a/tests/test_data.py b/tests/test_data.py
@@ -1,6 +1,7 @@
-import pytest
 import pandas as pd
+import pytest
 from sklearn.model_selection import train_test_split
+
 from coldsnap.data import Data
 
 

diff --git a/tests/test_model.py b/tests/test_model.py
@@ -1,5 +1,6 @@
 import pytest
 from sklearn.ensemble import RandomForestClassifier
+
 from coldsnap import Data, Model
 
 
@@ -602,8 +603,8 @@ def test_estimator_property_works(sample_dataframe):
 
 def test_get_estimator_type(sample_dataframe):
     """Test that _get_estimator_type correctly identifies estimator types."""
-    from sklearn.preprocessing import StandardScaler
     from sklearn.linear_model import LinearRegression
+    from sklearn.preprocessing import StandardScaler
 
     data_instance = Data.from_df(sample_dataframe, "label", test_size=0.2, random_state=42)
 
@@ -764,3 +765,140 @@ def test_features_property_consistency(sample_dataframe):
 
     # Both should have the same content (though prioritization differs)
     assert set(features_before) == set(features_after)
+
+
+# Tests for column order validation in predict()
+def test_predict_with_correct_column_order(model_instance):
+    """Test that predict returns valid labels when columns are in correct order."""
+    model_instance.fit()
+    X_test = model_instance.data.X_test
+
+    # X_test is passed unmodified; note that the absence of warnings is not asserted here
+    predictions = model_instance.predict(X_test)
+
+    assert len(predictions) == len(X_test)
+    assert set(predictions).issubset(set(model_instance.data.y_train.unique()))
+
+
+def test_predict_with_incorrect_column_order(model_instance):
+    """Test that predict warns and still predicts when columns are in reversed order."""
+
+    model_instance.fit()
+    X_test = model_instance.data.X_test
+
+    # Same columns as X_test, but in reversed order
+    X_test_reordered = X_test[list(reversed(X_test.columns))]
+
+    # Predict should succeed and emit the reorder UserWarning
+    with pytest.warns(UserWarning, match="columns have been reordered"):
+        predictions = model_instance.predict(X_test_reordered)
+
+    # One prediction per row, and every predicted label must be a label seen in y_train
+    assert len(predictions) == len(X_test)
+    assert set(predictions).issubset(set(model_instance.data.y_train.unique()))
+
+
+def test_predict_with_missing_columns(model_instance):
+    """Test that predict raises ValueError when a required column is missing."""
+
+    model_instance.fit()
+    X_test = model_instance.data.X_test
+
+    # Drop the first column so that one required feature is absent
+    X_test_missing = X_test.drop(columns=[X_test.columns[0]])
+
+    # predict() should raise ValueError with a message about the missing features
+    with pytest.raises(ValueError, match="missing required features"):
+        model_instance.predict(X_test_missing)
+
+
+def test_predict_with_extra_columns(model_instance):
+    """Test that predict accepts extra columns and emits the reorder warning."""
+
+    model_instance.fit()
+    X_test = model_instance.data.X_test
+
+    # Add a constant column that the model was not trained on
+    X_test_extra = X_test.copy()
+    X_test_extra["extra_feature"] = 999
+
+    # The extra column makes the column list differ from the expected one, so a warning fires
+    with pytest.warns(UserWarning, match="columns have been reordered"):
+        predictions = model_instance.predict(X_test_extra)
+
+    # One prediction per row, and every predicted label must be a label seen in y_train
+    assert len(predictions) == len(X_test)
+    assert set(predictions).issubset(set(model_instance.data.y_train.unique()))
+
+
+def test_predict_with_numpy_array_warns(model_instance):
+    """Test that predict warns when input is a numpy array without column names."""
+
+    model_instance.fit()
+    X_test = model_instance.data.X_test
+
+    # .values yields a plain numpy array, so the column names are lost
+    X_test_array = X_test.values
+
+    # Predict should succeed but warn that the feature order is being assumed
+    with pytest.warns(
+        UserWarning, match="no column names.*Assuming features are in the expected order"
+    ):
+        predictions = model_instance.predict(X_test_array)
+
+    # Only the number of predictions is checked here
+    assert len(predictions) == len(X_test)
+
+
+def test_predict_column_order_with_regressor(sample_dataframe):
+    """Test that the column reorder warning is also issued for regressors."""
+    from sklearn.linear_model import LinearRegression
+
+    # Build a numeric target from feature1 and feature2, replacing the class label
+    df = sample_dataframe.copy()
+    df["target"] = df["feature1"] * 2 + df["feature2"] * 3
+    df = df.drop(columns=["label"])
+
+    data_instance = Data.from_df(df, "target", test_size=0.2, random_state=42)
+    regressor = LinearRegression()
+    model = Model(data=data_instance, estimator=regressor)
+    model.fit()
+
+    X_test = model.data.X_test
+
+    # Same columns as X_test, but in reversed order
+    X_test_reordered = X_test[list(reversed(X_test.columns))]
+
+    with pytest.warns(UserWarning, match="columns have been reordered"):
+        predictions = model.predict(X_test_reordered)
+
+    assert len(predictions) == len(X_test)
+
+
+def test_predict_column_order_no_features_available():
+    """Test that predict still validates column order when the model has no Data object."""
+    import pandas as pd
+    from sklearn.ensemble import RandomForestClassifier
+
+    # Create model with only an estimator (no Data object is passed)
+    clf = RandomForestClassifier(random_state=42)
+    model = Model(estimator=clf)
+
+    # Fit the underlying estimator directly on a small two-column DataFrame
+    X_train = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
+    y_train = pd.Series([0, 1, 0])
+    model.clf.fit(X_train, y_train)
+
+    # After fitting on a DataFrame, model.features reports the training column names
+    assert model.features == ["a", "b"]
+
+    # Test with correct order - should work
+    X_test = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
+    predictions = model.predict(X_test)
+    assert len(predictions) == 2
+
+    # Test with incorrect order - should warn and reorder
+    X_test_wrong = pd.DataFrame({"b": [3, 4], "a": [1, 2]})
+    with pytest.warns(UserWarning, match="columns have been reordered"):
+        predictions = model.predict(X_test_wrong)
+    assert len(predictions) == 2
diff --git a/tests/test_model_mixins.py b/tests/test_model_mixins.py
@@ -1,8 +1,9 @@
+import matplotlib
 import pytest
 from sklearn.ensemble import RandomForestClassifier
+from sklearn.metrics import ConfusionMatrixDisplay, RocCurveDisplay
+
 from coldsnap import Data, Model
-from sklearn.metrics import RocCurveDisplay, ConfusionMatrixDisplay
-import matplotlib
 
 matplotlib.use("Agg")  # Include this line to suppress windows popping up during tests
 

diff --git a/tests/test_regression_evaluation.py b/tests/test_regression_evaluation.py
@@ -1,7 +1,8 @@
-import pytest
 import numpy as np
-from sklearn.linear_model import LinearRegression, Ridge, Lasso
+import pytest
 from sklearn.ensemble import RandomForestRegressor
+from sklearn.linear_model import Lasso, LinearRegression, Ridge
+
 from coldsnap import Data, Model
 
 
@@ -105,7 +106,7 @@

    # Metrics should be identical
    assert metrics_before.keys() == metrics_after.keys()
    for key in metrics_before.keys():
        assert np.isclose(metrics_before[key], metrics_after[key]), (
            f"{key} should be the same before and after loading"
        )

diff --git a/tests/test_serializable.py b/tests/test_serializable.py
@@ -1,6 +1,8 @@
-import pytest
-import pickle
 import gzip
+import pickle
+
+import pytest
+
 from coldsnap.serializable import (
     Serializable,
 )  # Replace `your_module` with the actual module name

diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -1,6 +1,7 @@
-from coldsnap.utils import create_overview
 import pandas as pd
 
+from coldsnap.utils import create_overview
+
 
 def test_create_overview():
     # Define the test data path