Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions example/001_create_models.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,21 @@
from coldsnap import Data, Model
import os

import pandas as pd
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
import os

from coldsnap import Data, Model

iris = datasets.load_iris(as_frame=True)
iris_df = pd.merge(iris.data, iris.target, how="inner", left_index=True, right_index=True)

if __name__ == "__main__":
try:
os.mkdir("./tmp/")
except FileExistsError:
pass

Check failure on line 18 in example/001_create_models.py

View workflow job for this annotation

GitHub Actions / ruff

Ruff (SIM105)

example/001_create_models.py:15:5: SIM105 Use `contextlib.suppress(FileExistsError)` instead of `try`-`except`-`pass`

Check failure on line 18 in example/001_create_models.py

View workflow job for this annotation

GitHub Actions / ruff

Ruff (SIM105)

example/001_create_models.py:15:5: SIM105 Use `contextlib.suppress(FileExistsError)` instead of `try`-`except`-`pass`

Check failure on line 18 in example/001_create_models.py

View workflow job for this annotation

GitHub Actions / ruff

Ruff (SIM105)

example/001_create_models.py:15:5: SIM105 Use `contextlib.suppress(FileExistsError)` instead of `try`-`except`-`pass`

Check failure on line 18 in example/001_create_models.py

View workflow job for this annotation

GitHub Actions / ruff

Ruff (SIM105)

example/001_create_models.py:15:5: SIM105 Use `contextlib.suppress(FileExistsError)` instead of `try`-`except`-`pass`

cs_data = Data.from_df(
iris_df,
Expand Down
3 changes: 2 additions & 1 deletion example/002_load_model.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from coldsnap import Model
import matplotlib.pyplot as plt

from coldsnap import Model

if __name__ == "__main__":
try:
cs_model = Model.from_pickle("./tmp/iris_model.pkl.gz")
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "ColdSnap"
version = "0.1.0"
version = "0.2.0"
description = "Create snapshots of machine learning models and their training data"
readme = "README.md"
requires-python = ">=3.10"
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from setuptools import find_packages, setup

with open("README.md", "r", encoding="utf-8") as fh:
with open("README.md", encoding="utf-8") as fh:
long_description = fh.read()

setup(
Expand Down
48 changes: 47 additions & 1 deletion src/coldsnap/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

import hashlib
import pickle
import warnings
from typing import Literal, Optional

from sklearn.base import BaseEstimator, is_classifier, is_regressor
Expand Down Expand Up @@ -221,14 +222,20 @@ def fit(self) -> None:
def predict(self, data):
"""Generate predictions using the fitted estimator.

Validates column order when DataFrame input is provided. If columns are present
but in incorrect order, they will be automatically reordered to match training
data with a warning. If input has no column names (e.g., numpy array), a warning
is issued about assuming the order matches training.

Args:
data: Input features (array-like or pandas DataFrame).

Returns:
Predicted values (classification labels or regression targets).

Raises:
ValueError: If no estimator is provided.
ValueError: If no estimator is provided, or if DataFrame is missing required
features.
TypeError: If called on a transformer (use transform() instead).

Examples:
Expand All @@ -241,6 +248,45 @@ def predict(self, data):
if estimator_type == "transformer":
raise TypeError("Cannot call predict() on a transformer. Use transform() instead.")

# Validate and reorder columns if necessary
import pandas as pd

if isinstance(data, pd.DataFrame):
expected_features = self.features

if expected_features is not None:
# Check if all expected features are present
data_columns = list(data.columns)
missing_features = set(expected_features) - set(data_columns)

if missing_features:
raise ValueError(
f"Input DataFrame is missing required features: {sorted(missing_features)}. "
f"Expected features: {expected_features}"
)

# Check if columns are in the correct order
# Only consider the features the model expects (ignore extra columns)
if data_columns != expected_features:
# Reorder columns to match expected order (and drop extra columns)
data = data[expected_features]
warnings.warn(
f"Input DataFrame columns have been reordered to match the expected "
f"feature order: {expected_features}. Original order was: {data_columns}",
UserWarning,
stacklevel=2,
)
else:
# Input is array-like (e.g., numpy array) without column names
expected_features = self.features
if expected_features is not None:
warnings.warn(
"Input data has no column names. Assuming features are in the expected "
f"order: {expected_features}",
UserWarning,
stacklevel=2,
)

return self._clf.predict(data)

def transform(self, data):
Expand Down
3 changes: 2 additions & 1 deletion tests/test_data.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import pytest
import pandas as pd
import pytest
from sklearn.model_selection import train_test_split

from coldsnap.data import Data


Expand Down
140 changes: 139 additions & 1 deletion tests/test_model.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import pytest
from sklearn.ensemble import RandomForestClassifier

from coldsnap import Data, Model


Expand Down Expand Up @@ -602,8 +603,8 @@ def test_estimator_property_works(sample_dataframe):

def test_get_estimator_type(sample_dataframe):
"""Test that _get_estimator_type correctly identifies estimator types."""
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

data_instance = Data.from_df(sample_dataframe, "label", test_size=0.2, random_state=42)

Expand Down Expand Up @@ -764,3 +765,140 @@ def test_features_property_consistency(sample_dataframe):

# Both should have the same content (though prioritization differs)
assert set(features_before) == set(features_after)


# Tests for column order validation in predict()
def test_predict_with_correct_column_order(model_instance):
    """Test that predict works, without column-order warnings, for correctly ordered input.

    Args:
        model_instance: Fixture (defined outside this view) providing a model whose
            ``data`` attribute exposes ``X_test`` and ``y_train``.
    """
    import warnings

    model_instance.fit()
    X_test = model_instance.data.X_test

    # Record every warning so the "no warning" expectation is actually verified
    # rather than only being stated in a comment.
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        predictions = model_instance.predict(X_test)

    # Only the column-order warnings raised by predict() matter here; unrelated
    # third-party warnings must not make this test fail.
    column_warnings = [
        w
        for w in caught
        if "reordered" in str(w.message) or "no column names" in str(w.message)
    ]
    assert not column_warnings

    assert len(predictions) == len(X_test)
    assert set(predictions).issubset(set(model_instance.data.y_train.unique()))


def test_predict_with_incorrect_column_order(model_instance):
    """Test that predict reorders columns and warns when order is incorrect.

    Args:
        model_instance: Fixture (defined outside this view) providing a model whose
            ``data`` attribute exposes ``X_test`` and ``y_train``.
    """
    model_instance.fit()
    X_test = model_instance.data.X_test

    # Baseline predictions from the original column order, used below to prove
    # that reordering does not change the result.
    expected = model_instance.predict(X_test)

    # Reorder columns (reverse order)
    X_test_reordered = X_test[list(reversed(X_test.columns))]

    # Predict should work but issue a warning
    with pytest.warns(UserWarning, match="columns have been reordered"):
        predictions = model_instance.predict(X_test_reordered)

    # Predictions must have a valid shape and valid labels ...
    assert len(predictions) == len(X_test)
    assert set(predictions).issubset(set(model_instance.data.y_train.unique()))
    # ... and must match what the correctly ordered input produces.
    assert list(predictions) == list(expected)


def test_predict_with_missing_columns(model_instance):
    """Check that predict() rejects a DataFrame that lacks a required feature."""
    model_instance.fit()
    features = model_instance.data.X_test

    # Drop the first feature column so the input is incomplete.
    first_column = features.columns[0]
    incomplete = features.drop(columns=[first_column])

    # The missing feature must surface as a ValueError.
    with pytest.raises(ValueError, match="missing required features"):
        model_instance.predict(incomplete)


def test_predict_with_extra_columns(model_instance):
    """Check that predict() tolerates a surplus column and emits the reorder warning."""
    model_instance.fit()
    features = model_instance.data.X_test

    # Work on a copy so the fixture's test split is left untouched.
    augmented = features.copy()
    augmented["extra_feature"] = 999

    # The surplus column makes the column list differ from the expected one,
    # which triggers the reorder warning.
    with pytest.warns(UserWarning, match="columns have been reordered"):
        result = model_instance.predict(augmented)

    # One prediction per row, each drawn from the labels seen in training.
    assert len(result) == len(features)
    known_labels = set(model_instance.data.y_train.unique())
    assert set(result).issubset(known_labels)


def test_predict_with_numpy_array_warns(model_instance):
    """Check that predict() warns when given a bare array that carries no column names."""
    model_instance.fit()
    features = model_instance.data.X_test

    # Strip the column labels by taking the underlying array.
    raw_values = features.values

    expected_message = "no column names.*Assuming features are in the expected order"
    with pytest.warns(UserWarning, match=expected_message):
        result = model_instance.predict(raw_values)

    # One prediction per input row.
    assert len(result) == len(features)


def test_predict_column_order_with_regressor(sample_dataframe):
    """Check that the column-order handling in predict() also applies to a regressor."""
    from sklearn.linear_model import LinearRegression

    # Build a regression frame: a numeric target derived from two features
    # replaces the classification label.
    frame = sample_dataframe.copy()
    frame["target"] = frame["feature1"] * 2 + frame["feature2"] * 3
    frame = frame.drop(columns=["label"])

    dataset = Data.from_df(frame, "target", test_size=0.2, random_state=42)
    model = Model(data=dataset, estimator=LinearRegression())
    model.fit()

    held_out = model.data.X_test

    # Present the same columns in reverse order.
    reversed_columns = list(reversed(held_out.columns))
    shuffled = held_out[reversed_columns]

    with pytest.warns(UserWarning, match="columns have been reordered"):
        result = model.predict(shuffled)

    assert len(result) == len(held_out)


def test_predict_column_order_no_features_available():
    """Test column-order validation for a model created without a Data object.

    No Data object is attached, so the expected features are not supplied by
    training data held on the model. After the estimator is fitted manually on a
    DataFrame, ``model.features`` still reports the column names, and predict()
    validates and reorders input columns against them.
    """
    import pandas as pd
    from sklearn.ensemble import RandomForestClassifier

    # Create model without a Data object
    clf = RandomForestClassifier(random_state=42)
    model = Model(estimator=clf)

    # Manually fit the underlying estimator on named columns
    X_train = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
    y_train = pd.Series([0, 1, 0])
    model.clf.fit(X_train, y_train)

    # The feature names are now available even without a Data object
    assert model.features == ["a", "b"]

    # Test with correct order - should work
    X_test = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
    predictions_ordered = model.predict(X_test)
    assert len(predictions_ordered) == 2

    # Test with incorrect order - should warn and reorder
    X_test_wrong = pd.DataFrame({"b": [3, 4], "a": [1, 2]})
    with pytest.warns(UserWarning, match="columns have been reordered"):
        predictions = model.predict(X_test_wrong)
    assert len(predictions) == 2

    # Same rows in a different column order must yield the same predictions
    assert list(predictions) == list(predictions_ordered)
5 changes: 3 additions & 2 deletions tests/test_model_mixins.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import matplotlib
import pytest
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import ConfusionMatrixDisplay, RocCurveDisplay

from coldsnap import Data, Model
from sklearn.metrics import RocCurveDisplay, ConfusionMatrixDisplay
import matplotlib

matplotlib.use("Agg") # Include this line to suppress windows popping up during tests

Expand Down
5 changes: 3 additions & 2 deletions tests/test_regression_evaluation.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import pytest
import numpy as np
from sklearn.linear_model import LinearRegression, Ridge, Lasso
import pytest
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso, LinearRegression, Ridge

from coldsnap import Data, Model


Expand Down Expand Up @@ -105,7 +106,7 @@

# Metrics should be identical
assert metrics_before.keys() == metrics_after.keys()
for key in metrics_before.keys():

Check failure on line 109 in tests/test_regression_evaluation.py

View workflow job for this annotation

GitHub Actions / ruff

Ruff (SIM118)

tests/test_regression_evaluation.py:109:9: SIM118 Use `key in dict` instead of `key in dict.keys()`

Check failure on line 109 in tests/test_regression_evaluation.py

View workflow job for this annotation

GitHub Actions / ruff

Ruff (SIM118)

tests/test_regression_evaluation.py:109:9: SIM118 Use `key in dict` instead of `key in dict.keys()`

Check failure on line 109 in tests/test_regression_evaluation.py

View workflow job for this annotation

GitHub Actions / ruff

Ruff (SIM118)

tests/test_regression_evaluation.py:109:9: SIM118 Use `key in dict` instead of `key in dict.keys()`

Check failure on line 109 in tests/test_regression_evaluation.py

View workflow job for this annotation

GitHub Actions / ruff

Ruff (SIM118)

tests/test_regression_evaluation.py:109:9: SIM118 Use `key in dict` instead of `key in dict.keys()`
assert np.isclose(metrics_before[key], metrics_after[key]), (
f"{key} should be the same before and after loading"
)
Expand Down
6 changes: 4 additions & 2 deletions tests/test_serializable.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import pytest
import pickle
import gzip
import pickle

import pytest

from coldsnap.serializable import (
Serializable,
) # Replace `your_module` with the actual module name
Expand Down
3 changes: 2 additions & 1 deletion tests/test_utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from coldsnap.utils import create_overview
import pandas as pd

from coldsnap.utils import create_overview


def test_create_overview():
# Define the test data path
Expand Down
Loading