diff --git a/docs/api/models.rst b/docs/api/models.rst index 7fcb9c2e2..0556012e7 100644 --- a/docs/api/models.rst +++ b/docs/api/models.rst @@ -19,6 +19,21 @@ Functions register_model reset_model_register +Julearn custom models +--------------------- + +This is a list of models implemented by Julearn that are not simple wrappers +around existing models in other libraries but rather variants of existing +models or novel models. + +.. autosummary:: + :nosignatures: + :toctree: generated/ + :template: class.rst + + xgb_cvearlystopping.XGBClassifierCVEarlyStopping + xgb_cvearlystopping.XGBRegressorCVEarlyStopping + Dynamic Selection (DESLib) ========================== diff --git a/docs/available_pipeline_steps.rst b/docs/available_pipeline_steps.rst index 2370764e6..e6b0737ef 100644 --- a/docs/available_pipeline_steps.rst +++ b/docs/available_pipeline_steps.rst @@ -235,6 +235,20 @@ Ensemble - Y - Y - Y + * - ``xgb`` + - XGBoost + - | :class:`~xgboost.XGBClassifier` and + | :class:`~xgboost.XGBRegressor` + - Y + - Y + - Y + * - ``xgb_cvearlystopping`` + - XGBoost with Cross-Validation and Early Stopping + - | :class:`~julearn.models.xgb_cvearlystopping.XGBClassifierCVEarlyStopping` and + | :class:`~julearn.models.xgb_cvearlystopping.XGBRegressorCVEarlyStopping` + - Y + - Y + - Y Gaussian Processes ~~~~~~~~~~~~~~~~~~ diff --git a/docs/conf.py b/docs/conf.py index cab9cfe9c..4eb525573 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -231,6 +231,7 @@ None, ), "panel": ("https://panel.holoviz.org/", None), + "xgboost": ("https://xgboost.readthedocs.io/en/stable/", None), } # -- sphinx.ext.extlinks configuration --------------------------------------- diff --git a/julearn/models/available_models.py b/julearn/models/available_models.py index bf801c8a6..6ff070cda 100644 --- a/julearn/models/available_models.py +++ b/julearn/models/available_models.py @@ -46,6 +46,19 @@ ) from sklearn.svm import SVC, SVR + +try: # pragma: no cover + from xgboost import XGBClassifier, 
XGBRegressor + + from .xgb_cvearlystopping import ( + XGBClassifierCVEarlyStopping, + XGBRegressorCVEarlyStopping, + ) + + _has_xgboost = True +except ImportError: + _has_xgboost = False + from ..utils import logger, raise_error, warn_with_log from ..utils.logging import DelayedFmtMessage as __ from ..utils.typing import ModelLike @@ -137,6 +150,24 @@ }, } +if _has_xgboost is True: + _available_models["xgb"] = { + "regression": XGBRegressor, + "classification": XGBClassifier, + } + _available_models["xgb_cvearlystopping"] = { + "regression": XGBRegressorCVEarlyStopping, + "classification": XGBClassifierCVEarlyStopping, + } + logger.info( + "XGBoost is available and has been added to the model registry." + ) +else: + logger.info( + "XGBoost is not available and has not been added to the model " + "registry. To use XGBoost models, please install the xgboost package." + ) + _available_models_reset = deepcopy(_available_models) diff --git a/julearn/models/tests/test_xgb_cvearlystopping.py b/julearn/models/tests/test_xgb_cvearlystopping.py new file mode 100644 index 000000000..1119bf38c --- /dev/null +++ b/julearn/models/tests/test_xgb_cvearlystopping.py @@ -0,0 +1,476 @@ +"""Provide tests for XGBEarlyStoppingCV.""" + +# Authors: Federico Raimondo +# License: AGPL + +import pandas as pd +import pytest +from sklearn.utils.validation import _is_fitted + +from julearn.models.xgb_cvearlystopping import ( + XGBClassifierCVEarlyStopping, + XGBRegressorCVEarlyStopping, +) + + +def test_XGBRegressorCVEarlyStopping_grouped(df_iris) -> None: + """Test XGBRegressorCVEarlyStopping with grouped data. + + Parameters + ---------- + df_iris : pd.DataFrame + The iris dataset as a DataFrame. 
+ + """ + X = ["sepal_length", "sepal_width", "petal_width"] + y = "petal_length" + n_groups = 20 + bins = pd.cut( + df_iris.index.values, labels=list(range(n_groups)), bins=n_groups + ) + df_iris["group"] = bins.astype(int) + + model = XGBRegressorCVEarlyStopping( + test_size=0.2, early_stopping_rounds=5, random_state=42 + ) + + assert _is_fitted(model) is False + assert not hasattr(model, "_grouped_cv") + model.fit(df_iris[X], df_iris[y], groups=df_iris["group"]) + assert _is_fitted(model) + assert hasattr(model, "_grouped_cv") + assert model._grouped_cv is True + assert model._model.get_params()["num_parallel_tree"] is None + + # Check that the model was refit with the best number of iterations + assert model._model.get_params()["early_stopping_rounds"] is None + assert model._model.get_params()["random_state"] == 42 + assert model._best_iteration is not None + assert ( + model._model.get_params()["n_estimators"] == model._best_iteration + 1 + ) + + y_pred = model.predict(df_iris[X]) + assert y_pred.shape == (len(df_iris),) + + score = model.score(df_iris[X], df_iris[y]) + assert isinstance(score, float) + + +def test_XGBRegressorCVEarlyStopping_notgrouped(df_iris) -> None: + """Test XGBRegressorCVEarlyStopping with non-grouped data. + + Parameters + ---------- + df_iris : pd.DataFrame + The iris dataset as a DataFrame. 
+ + """ + X = ["sepal_length", "sepal_width", "petal_width"] + y = "petal_length" + + model = XGBRegressorCVEarlyStopping( + test_size=0.2, early_stopping_rounds=5, random_state=42 + ) + + assert _is_fitted(model) is False + assert not hasattr(model, "_grouped_cv") + model.fit(df_iris[X], df_iris[y]) + assert _is_fitted(model) + assert hasattr(model, "_grouped_cv") + assert model._grouped_cv is False + assert model._model.get_params()["num_parallel_tree"] is None + + # Check that the model was refit with the best number of iterations + assert model._model.get_params()["early_stopping_rounds"] is None + assert model._model.get_params()["random_state"] == 42 + assert model._best_iteration is not None + assert ( + model._model.get_params()["n_estimators"] == model._best_iteration + 1 + ) + + +def test_XGBRegressorCVEarlyStopping_numpy(df_iris) -> None: + """Test XGBRegressorCVEarlyStopping with numpy data. + + Parameters + ---------- + df_iris : pd.DataFrame + The iris dataset as a DataFrame. + + """ + X = ["sepal_length", "sepal_width", "petal_width"] + y = "petal_length" + + model = XGBRegressorCVEarlyStopping( + test_size=0.2, + early_stopping_rounds=5, + random_state=42, + num_parallel_tree=2, + ) + + assert _is_fitted(model) is False + assert not hasattr(model, "_grouped_cv") + model.fit(df_iris[X].values, df_iris[y].values) + assert _is_fitted(model) + assert hasattr(model, "_grouped_cv") + assert model._grouped_cv is False + + # Check that the model was refit with the best number of iterations + assert model._model.get_params()["early_stopping_rounds"] is None + assert model._model.get_params()["random_state"] == 42 + assert model._best_iteration is not None + assert model._model.get_params()["num_parallel_tree"] == 2 + assert ( + model._model.get_params()["n_estimators"] + == (model._best_iteration + 1) * 2 + ) + + +def test_XGBClassifierCVEarlyStopping_notgrouped(df_iris) -> None: + """Test XGBClassifierCVEarlyStopping with non-grouped data. 
+ + Parameters + ---------- + df_iris : pd.DataFrame + The iris dataset as a DataFrame. + + """ + X = ["sepal_length", "sepal_width", "petal_width"] + y = "species" + + model = XGBClassifierCVEarlyStopping( + test_size=0.2, early_stopping_rounds=5, random_state=42 + ) + + assert _is_fitted(model) is False + assert not hasattr(model, "_grouped_cv") + model.fit(df_iris[X], df_iris[y]) + assert _is_fitted(model) + assert hasattr(model, "_grouped_cv") + assert model._grouped_cv is False + assert model._model.get_params()["num_parallel_tree"] is None + + # Check that the model was refit with the best number of iterations + assert model._model.get_params()["early_stopping_rounds"] is None + assert model._model.get_params()["random_state"] == 42 + assert model._best_iteration is not None + + # Three classes, so the number of trees is the best iteration times 3 + assert ( + model._model.get_params()["n_estimators"] + == (model._best_iteration + 1) * 3 + ) + + +def test_XGBClassifierCVEarlyStopping_grouped(df_iris) -> None: + """Test XGBClassifierCVEarlyStopping with grouped data. + + Parameters + ---------- + df_iris : pd.DataFrame + The iris dataset as a DataFrame. 
+ + """ + X = ["sepal_length", "sepal_width", "petal_width"] + y = "species" + n_groups = 20 + bins = pd.cut( + df_iris.index.values, labels=list(range(n_groups)), bins=n_groups + ) + df_iris["group"] = bins.astype(int) + + model = XGBClassifierCVEarlyStopping( + test_size=0.2, early_stopping_rounds=5, random_state=42 + ) + + assert _is_fitted(model) is False + assert not hasattr(model, "_grouped_cv") + model.fit(df_iris[X], df_iris[y], groups=df_iris["group"]) + assert _is_fitted(model) + assert hasattr(model, "_grouped_cv") + assert model._grouped_cv is True + assert model.get_params()["test_size"] == 0.2 + assert model.get_params()["early_stopping_rounds"] == 5 + assert model.get_params()["random_state"] == 42 + + # Check that the model was refit with the best number of iterations + assert model._model.get_params()["early_stopping_rounds"] is None + assert model._model.get_params()["random_state"] == 42 + assert model._best_iteration is not None + + # Three classes, so the number of trees is the best iteration times 3 + assert ( + model._model.get_params()["n_estimators"] + == (model._best_iteration + 1) * 3 + ) + + y_pred = model.predict(df_iris[X]) + assert y_pred.shape == (len(df_iris),) + assert set(y_pred).issubset(set(df_iris[y])) + + y_probas = model.predict_proba(df_iris[X]) + assert y_probas.shape == (len(df_iris), 3) + assert (y_probas >= 0).all() and (y_probas <= 1).all() + + score = model.score(df_iris[X], df_iris[y]) + assert isinstance(score, float) + + +def test_XGBClassifierCVEarlyStopping_binary(df_binary) -> None: + """Test XGBClassifierCVEarlyStopping with binary classification. + + Parameters + ---------- + df_binary : pd.DataFrame + The binary classification dataset as a DataFrame. 
+ + """ + X = ["sepal_length", "sepal_width", "petal_width"] + y = "species" + + model = XGBClassifierCVEarlyStopping( + test_size=0.2, early_stopping_rounds=5 + ) + + assert _is_fitted(model) is False + assert not hasattr(model, "_grouped_cv") + model.fit(df_binary[X], df_binary[y]) + assert _is_fitted(model) + assert hasattr(model, "_grouped_cv") + assert model._grouped_cv is False + assert model.get_params()["test_size"] == 0.2 + assert model.get_params()["early_stopping_rounds"] == 5 + # Check that the model was refit with the best number of iterations + assert model._model.get_params()["early_stopping_rounds"] is None + assert model._model.get_params()["random_state"] is None + assert model._best_iteration is not None + + # Two classes, so the number of trees is the best iteration times 2 + assert ( + model._model.get_params()["n_estimators"] + == (model._best_iteration + 1) * 2 + ) + y_pred = model.predict(df_binary[X]) + assert y_pred.shape == (len(df_binary),) + assert set(y_pred).issubset(set(df_binary[y])) + + y_probas = model.predict_proba(df_binary[X]) + assert y_probas.shape == (len(df_binary), 2) + assert (y_probas >= 0).all() and (y_probas <= 1).all() + + score = model.score(df_binary[X], df_binary[y]) + assert isinstance(score, float) + + +def test_XGBClassifierCVEarlyStopping_grouped_numpy(df_iris) -> None: + """Test XGBClassifierCVEarlyStopping with grouped data and numpy arrays. + + Parameters + ---------- + df_iris : pd.DataFrame + The iris dataset as a DataFrame. 
+ + """ + X = ["sepal_length", "sepal_width", "petal_width"] + y = "species" + n_groups = 20 + bins = pd.cut( + df_iris.index.values, labels=list(range(n_groups)), bins=n_groups + ) + df_iris["group"] = bins.astype(int) + + model = XGBClassifierCVEarlyStopping( + test_size=0.2, early_stopping_rounds=5, random_state=42 + ) + + assert _is_fitted(model) is False + assert not hasattr(model, "_grouped_cv") + model.fit( + df_iris[X].values, + df_iris[y].values.to_numpy(), + groups=df_iris["group"].values, + ) + assert _is_fitted(model) + assert hasattr(model, "_grouped_cv") + assert model._grouped_cv is True + assert model.get_params()["test_size"] == 0.2 + assert model.get_params()["early_stopping_rounds"] == 5 + assert model.get_params()["random_state"] == 42 + + # Check that the model was refit with the best number of iterations + assert model._model.get_params()["early_stopping_rounds"] is None + assert model._model.get_params()["random_state"] == 42 + assert model._best_iteration is not None + + # Three classes, so the number of trees is the best iteration times 3 + assert ( + model._model.get_params()["n_estimators"] + == (model._best_iteration + 1) * 3 + ) + + y_pred = model.predict(df_iris[X]) + assert y_pred.shape == (len(df_iris),) + assert set(y_pred).issubset(set(df_iris[y])) + + y_probas = model.predict_proba(df_iris[X]) + assert y_probas.shape == (len(df_iris), 3) + assert (y_probas >= 0).all() and (y_probas <= 1).all() + + score = model.score(df_iris[X], df_iris[y]) + assert isinstance(score, float) + + +def test_XGBClassifierCVEarlyStopping_errors() -> None: + """Test XGBClassifierCVEarlyStopping error handling.""" + with pytest.raises(ValueError, match="early_stopping_rounds"): + model = XGBClassifierCVEarlyStopping( + test_size=0.2, early_stopping_rounds=None, random_state=42 + ) + + with pytest.raises(ValueError, match="not fitted"): + model = XGBClassifierCVEarlyStopping( + test_size=None, early_stopping_rounds=5, random_state=42 + ) + 
model.predict([[1, 2], [3, 4], [5, 6]]) + + with pytest.raises(ValueError, match="not fitted"): + model = XGBClassifierCVEarlyStopping( + test_size=None, early_stopping_rounds=5, random_state=42 + ) + model.predict_proba([[1, 2], [3, 4], [5, 6]]) + + +def test_XGBClassifierCVEarlyStopping_numpy(df_iris) -> None: + """Test XGBClassifierCVEarlyStopping with numpy data. + + Parameters + ---------- + df_iris : pd.DataFrame + The iris dataset as a DataFrame. + + """ + X = ["sepal_length", "sepal_width", "petal_width"] + y = "species" + + model = XGBClassifierCVEarlyStopping( + test_size=0.2, early_stopping_rounds=5, random_state=42 + ) + + assert _is_fitted(model) is False + assert not hasattr(model, "_grouped_cv") + model.fit(df_iris[X].values, df_iris[y].values) + assert _is_fitted(model) + assert hasattr(model, "_grouped_cv") + assert model._grouped_cv is False + + # Check that the model was refit with the best number of iterations + assert model._model.get_params()["early_stopping_rounds"] is None + assert model._model.get_params()["random_state"] == 42 + assert model._best_iteration is not None + + # Three classes, so the number of trees is the best iteration times 3 + assert ( + model._model.get_params()["n_estimators"] + == (model._best_iteration + 1) * 3 + ) + + y_nostring = df_iris[y].values.to_numpy() == "setosa" + + model.fit(df_iris[X].values, y_nostring) + assert _is_fitted(model) + assert hasattr(model, "_grouped_cv") + assert model._grouped_cv is False + + # Check that the model was refit with the best number of iterations + assert model._model.get_params()["early_stopping_rounds"] is None + assert model._model.get_params()["random_state"] == 42 + assert model._best_iteration is not None + + # Two classes, so the number of trees is the best iteration times 2 + assert ( + model._model.get_params()["n_estimators"] + == (model._best_iteration + 1) * 2 + ) + + y_pred = model.predict(df_iris[X].values) + assert y_pred.shape == (len(df_iris),) + assert 
set(y_pred).issubset(set(y_nostring)) + + y_probas = model.predict_proba(df_iris[X].values) + assert y_probas.shape == (len(df_iris), 2) + assert (y_probas >= 0).all() and (y_probas <= 1).all() + + score = model.score(df_iris[X].values, y_nostring) + assert isinstance(score, float) + + +def test_XGBClassifierCVEarlyStopping_set_params(df_iris) -> None: + """Test set_params on XGBClassifierCVEarlyStopping followed by a refit. + + Parameters + ---------- + df_iris : pd.DataFrame + The iris dataset as a DataFrame. + + """ + X = ["sepal_length", "sepal_width", "petal_width"] + y = "species" + n_groups = 20 + bins = pd.cut( + df_iris.index.values, labels=list(range(n_groups)), bins=n_groups + ) + df_iris["group"] = bins.astype(int) + + model = XGBClassifierCVEarlyStopping( + test_size=0.2, early_stopping_rounds=5, random_state=42 + ) + + assert _is_fitted(model) is False + assert not hasattr(model, "_grouped_cv") + model.fit(df_iris[X], df_iris[y], groups=df_iris["group"]) + assert _is_fitted(model) + assert hasattr(model, "_grouped_cv") + assert model._grouped_cv is True + assert model.get_params()["test_size"] == 0.2 + assert model.get_params()["early_stopping_rounds"] == 5 + assert model.get_params()["random_state"] == 42 + + # Check that the model was refit with the best number of iterations + assert model._model.get_params()["early_stopping_rounds"] is None + assert model._model.get_params()["random_state"] == 42 + assert model._best_iteration is not None + + # Three classes, so the number of trees is the best iteration times 3 + assert ( + model._model.get_params()["n_estimators"] + == (model._best_iteration + 1) * 3 + ) + + model.set_params( + test_size=0.3, + early_stopping_rounds=10, + random_state=24, + num_parallel_tree=2, + ) + assert model.get_params()["test_size"] == 0.3 + assert model.get_params()["early_stopping_rounds"] == 10 + assert model.get_params()["random_state"] == 24 + assert model.get_params()["num_parallel_tree"] == 2 + model.fit(df_iris[X], df_iris[y], 
groups=df_iris["group"]) + assert _is_fitted(model) + assert hasattr(model, "_grouped_cv") + assert model._grouped_cv is True + assert model.get_params()["test_size"] == 0.3 + assert model.get_params()["early_stopping_rounds"] == 10 + assert model.get_params()["random_state"] == 24 + assert model.get_params()["num_parallel_tree"] == 2 + # Check that the model was refit with the best number of iterations + assert model._model.get_params()["early_stopping_rounds"] is None + assert model._model.get_params()["random_state"] == 24 + assert model._best_iteration is not None + # Three classes and two parallel trees, so best iteration times 3 * 2 + assert ( + model._model.get_params()["n_estimators"] + == (model._best_iteration + 1) * 3 * 2 + ) diff --git a/julearn/models/xgb_cvearlystopping.py b/julearn/models/xgb_cvearlystopping.py new file mode 100644 index 000000000..7146d0a73 --- /dev/null +++ b/julearn/models/xgb_cvearlystopping.py @@ -0,0 +1,382 @@ +"""Estimator wrappers for XGBoost with cross-validated early stopping.""" + +# Authors: Federico Raimondo +# License: AGPL + +from typing import Any + +import numpy as np +import pandas as pd +import sklearn +from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin +from sklearn.model_selection import ( + GroupShuffleSplit, + train_test_split, +) +from sklearn.preprocessing import LabelEncoder +from xgboost import XGBClassifier, XGBRegressor + +from julearn.utils.typing import DataLike + + +sklearn.set_config(enable_metadata_routing=True) + + +class _BaseXGBCVEarlyStopping(BaseEstimator): + """Base class for XGBoost with cross-validated early stopping. + + A wrapper for XGBoost that performs early stopping using a + cross-validation split of the data. The model is first trained on a + training set with early stopping based on a validation set, and then refit + on the full data using the best number of iterations found. 
+ + Parameters + ---------- + base_estimator : class + The base XGBoost estimator class to use (e.g. XGBRegressor or + XGBClassifier). + test_size : float or int or None + The proportion of the data to use as the validation set for early + stopping. If groups is used on `fit`, this parameter refers to the + number of groups, otherwise it refers to the number of samples. If + float, should be between 0.0 and 1.0 and represent the + proportion of the dataset to include in the test split. If int, + represents the absolute number. If None, the value is + set to the complement of the train size. If train_size is also None, + it will be set to 0.25 in the case of non-grouped data or 0.2 for + grouped data (scikit-learn's defaults for `train_test_split` and + `GroupShuffleSplit`). + early_stopping_rounds : int + The number of rounds to use for early stopping. + **kwargs : dict + Extra keyword arguments to pass to the XGBoost estimator. + + """ + + def __init__( + self, + base_estimator: XGBRegressor | XGBClassifier, + test_size: float | int | None, + early_stopping_rounds: int, + **kwargs, + ): + self.test_size = test_size + if early_stopping_rounds is None: + raise ValueError( + "early_stopping_rounds must be set for CV early stopping." + ) + self.early_stopping_rounds = early_stopping_rounds + self.random_state = kwargs.get("random_state", None) + self.base_estimator = base_estimator + self._xgboost_kwargs = kwargs + if self._xgboost_kwargs is None: + self._xgboost_kwargs = {} + self._model = None + self._is_fitted = False + self.set_fit_request(groups=True) + + def fit( + self, + X: DataLike, # noqa: N803 + y: DataLike, + groups: DataLike | None = None, + ) -> "_BaseXGBCVEarlyStopping": + """Fit the model. + + Parameters + ---------- + X : DataLike + The data to fit the model on. + y : DataLike + The target data. + groups : DataLike or None + The group labels for the samples used while splitting the dataset + into train/test set for early stopping. 
If None, standard + train/test split is used, by default None. + + Returns + ------- + _BaseXGBCVEarlyStopping + The fitted model. + + """ + if groups is not None: + gss = GroupShuffleSplit( + n_splits=1, + test_size=self.test_size, + random_state=self.random_state, + ) + train_idx, test_idx = next(gss.split(X, y, groups)) + if isinstance(X, pd.DataFrame): + X_train, X_test = X.iloc[train_idx], X.iloc[test_idx] + else: + X_train, X_test = X[train_idx], X[test_idx] + + if isinstance(y, pd.Series): + y_train, y_test = y.iloc[train_idx], y.iloc[test_idx] + else: + y_train, y_test = y[train_idx], y[test_idx] + self._grouped_cv = True + else: + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=self.test_size, random_state=self.random_state + ) + self._grouped_cv = False + # Build a first model + model = self.base_estimator( + early_stopping_rounds=self.early_stopping_rounds, + **self._xgboost_kwargs, + ) + model.fit(X=X_train, y=y_train, eval_set=[(X_test, y_test)]) + + # Create a model with the max iterations set as the best epochs and + # refit on full data + t_kwargs = self._xgboost_kwargs.copy() + self._best_iteration = model.best_iteration + + num_parallel_tree = model.get_params().get("num_parallel_tree") + if num_parallel_tree is None: + num_parallel_tree = 1 + n_classes = getattr(model, "n_classes_", 1) + t_kwargs["n_estimators"] = ( + (self._best_iteration + 1) * num_parallel_tree * n_classes + ) + model = self.base_estimator(**t_kwargs) + model.fit(X=X, y=y) + self._model = model + self._is_fitted = True + + return self + + def predict(self, X: DataLike) -> DataLike: # noqa: N803 + """Predict using the model. + + Parameters + ---------- + X : pd.DataFrame + The data to predict on. + + Returns + ------- + DataLike + The predictions. + + """ + if self._model is None: + raise ValueError("Model not fitted") + return self._model.predict(X) + + def __sklearn_is_fitted__(self) -> bool: + """Check if the model is fitted. 
+ + Returns + ------- + bool + True if the model is fitted, False otherwise. + + """ + return hasattr(self, "_is_fitted") and self._is_fitted + + def get_params(self, deep: bool = True) -> dict: + """Get the parameters of the model. + + Parameters + ---------- + deep : bool + If True, will return the parameters for this model and + contained subobjects that are estimators (default is True). + + Returns + ------- + params : dict + Parameter names mapped to their values. + + """ + params = { + "test_size": self.test_size, + "early_stopping_rounds": self.early_stopping_rounds, + } + params.update(self._xgboost_kwargs) + return params + + def set_params(self, **params: Any) -> "_BaseXGBCVEarlyStopping": + """Set the parameters of the model. + + Parameters + ---------- + **params : dict + Estimator parameters. + + Returns + ------- + _BaseXGBCVEarlyStopping + The model with updated parameters. + + """ + for param, value in params.items(): + if param in ["test_size", "early_stopping_rounds"]: + setattr(self, param, value) + elif param == "random_state": + self.random_state = value + self._xgboost_kwargs["random_state"] = value + else: + self._xgboost_kwargs[param] = value + return self + + +class XGBRegressorCVEarlyStopping(_BaseXGBCVEarlyStopping, RegressorMixin): + """XGBRegressor with cross-validated early stopping. + + A wrapper for XGBoost that performs early stopping using a + cross-validation split of the data. The model is first trained on a + training set with early stopping based on a validation set, and then refit + on the full data using the best number of iterations found. + + Parameters + ---------- + test_size : float or int or None + The proportion of the data to use as the validation set for early + stopping. If groups is used on `fit`, this parameter refers to the + number of groups, otherwise it refers to the number of samples. If + float, should be between 0.0 and 1.0 and represent the + proportion of the dataset to include in the test split. 
If int, + represents the absolute number. If None, the value is + set to the complement of the train size. If train_size is also None, + it will be set to 0.25 in the case of non-grouped data or 0.2 for + grouped data (scikit-learn's defaults for `train_test_split` and + `GroupShuffleSplit`). + early_stopping_rounds : int + The number of rounds to use for early stopping. + **kwargs : dict + Extra keyword arguments to pass to the XGBRegressor. + + """ + + def __init__( + self, + test_size: float | int | None, + early_stopping_rounds: int, + **kwargs: Any, + ): + super().__init__( + base_estimator=XGBRegressor, + test_size=test_size, + early_stopping_rounds=early_stopping_rounds, + **kwargs, + ) + + +class XGBClassifierCVEarlyStopping(_BaseXGBCVEarlyStopping, ClassifierMixin): + """XGBClassifier with cross-validated early stopping. + + A wrapper for XGBoost that performs early stopping using a + cross-validation split of the data. The model is first trained on a + training set with early stopping based on a validation set, and then refit + on the full data using the best number of iterations found. + + Parameters + ---------- + test_size : float or int or None + The proportion of the data to use as the validation set for early + stopping. If groups is used on `fit`, this parameter refers to the + number of groups, otherwise it refers to the number of samples. If + float, should be between 0.0 and 1.0 and represent the + proportion of the dataset to include in the test split. If int, + represents the absolute number. If None, the value is + set to the complement of the train size. If train_size is also None, + it will be set to 0.25 in the case of non-grouped data or 0.2 for + grouped data (scikit-learn's defaults for `train_test_split` and + `GroupShuffleSplit`). + early_stopping_rounds : int + The number of rounds to use for early stopping. + **kwargs : dict + Extra keyword arguments to pass to the XGBClassifier. 
+ + """ + + def __init__( + self, + test_size: float | int | None, + early_stopping_rounds: int, + **kwargs: Any, + ): + super().__init__( + base_estimator=XGBClassifier, + test_size=test_size, + early_stopping_rounds=early_stopping_rounds, + **kwargs, + ) + + def fit( + self, + X: DataLike, # noqa: N803 + y: DataLike, + groups: DataLike | None = None, + ) -> "XGBClassifierCVEarlyStopping": + """Fit the model. + + Parameters + ---------- + X : DataLike + The data to fit the model on. + y : DataLike + The target data. + groups : DataLike or None + The group labels for the samples used while splitting the dataset + into train/test set for early stopping. If None, standard + train/test split is used, by default None. + + Returns + ------- + XGBClassifierCVEarlyStopping + The fitted model. + + """ + self._label_encoder = None + # Check if labels are strings and convert to integers if so, to avoid + # issues with XGBoost + if isinstance( + y, pd.Series | np.ndarray | pd.arrays.StringArray + ) and y.dtype in ["object", "string"]: + self._label_encoder = LabelEncoder() + y = self._label_encoder.fit_transform(y) # type: ignore + super().fit(X, y, groups) + self.classes_ = self._model.classes_ # type: ignore + return self + + def predict(self, X: DataLike) -> DataLike: # noqa: N803 + """Predict using the model. + + Parameters + ---------- + X : DataLike + The data to predict on. + + Returns + ------- + DataLike + The predictions. + + """ + out = super().predict(X) + if self._label_encoder is not None: + out = self._label_encoder.inverse_transform(out) + return out + + def predict_proba(self, X: DataLike) -> DataLike: # noqa: N803 + """Predict probabilities using the model. + + Parameters + ---------- + X : DataLike + The data to predict on. + + Returns + ------- + DataLike + The predicted class probabilities. 
+ + """ + if self._model is None: + raise ValueError("Model not fitted") + return self._model.predict_proba(X) diff --git a/pyproject.toml b/pyproject.toml index 7ce123698..a4ecf7f82 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -80,12 +80,14 @@ optuna = [ "optuna_integration>=4.0.0,<5.0.0", ] +xgboost = ["xgboost>=3.0.0,<4.0.0"] + # Everything we need for docs is here -docs = ["julearn[sphinx,viz,optuna,skopt]"] +docs = ["julearn[sphinx,viz,optuna,skopt,xgboost]"] # Add all optional functional dependencies (skip deslib until its fixed) # This does not include dev/docs building dependencies -all = ["julearn[viz,skopt,optuna]"] +all = ["julearn[viz,skopt,optuna,xgboost]"] ################ # Tool configs # @@ -296,11 +298,13 @@ docs = [ "sphinx-polyversion>=2.0.0", "sphinxcontrib-towncrier>=0.5.0a0", "towncrier>=25.8.0", + "xgboost>=3.0.0,<4.0.0", ] optional = [ "optuna>=4.0.0,<5.0.0", "optuna-integration>=4.0.0,<5.0.0", "scikit-optimize>=0.10.2,<0.11.0", + "xgboost>=3.0.0,<4.0.0", ] viz = [ "bokeh>=3.4.3", diff --git a/tox.ini b/tox.ini index 07a984197..1d5e78319 100644 --- a/tox.ini +++ b/tox.ini @@ -26,6 +26,7 @@ deps = scikit-optimize>=0.10.2,<0.11 optuna>=4.0.0,<5.0.0 optuna-integration>=4.0.0,<5.0.0 + xgboost>=3.0.0,<4.0.0 commands = pytest {toxinidir}/julearn @@ -62,6 +63,7 @@ deps = scikit-optimize>=0.10.2,<0.11 optuna>=4.0.0,<5.0.0 optuna-integration>=4.0.0,<5.0.0 + xgboost>=3.0.0,<4.0.0 commands = pytest -vv {toxinidir}/julearn @@ -88,6 +90,7 @@ deps = scikit-optimize>=0.10.2,<0.11 optuna>=4.0.0,<5.0.0 optuna-integration>=4.0.0,<5.0.0 + xgboost>=3.0.0,<4.0.0 commands = pytest --cov={envsitepackagesdir}/julearn --cov=./julearn --cov-report=xml --cov-report=term --cov-config=pyproject.toml -vv diff --git a/uv.lock b/uv.lock index b83739278..0fe4cea1d 100644 --- a/uv.lock +++ b/uv.lock @@ -597,6 +597,7 @@ all = [ { name = "panel" }, { name = "param" }, { name = "scikit-optimize" }, + { name = "xgboost" }, ] deslib = [ { name = "deslib" }, @@ -625,6 
+626,7 @@ docs = [ { name = "sphinx-polyversion" }, { name = "sphinxcontrib-towncrier" }, { name = "towncrier" }, + { name = "xgboost" }, ] optuna = [ { name = "optuna" }, @@ -651,6 +653,9 @@ viz = [ { name = "panel" }, { name = "param" }, ] +xgboost = [ + { name = "xgboost" }, +] [package.dev-dependencies] dev = [ @@ -676,11 +681,13 @@ docs = [ { name = "sphinx-polyversion" }, { name = "sphinxcontrib-towncrier" }, { name = "towncrier" }, + { name = "xgboost" }, ] optional = [ { name = "optuna" }, { name = "optuna-integration" }, { name = "scikit-optimize" }, + { name = "xgboost" }, ] viz = [ { name = "bokeh" }, @@ -693,8 +700,8 @@ requires-dist = [ { name = "bokeh", marker = "extra == 'viz'", specifier = ">=3.0.0" }, { name = "deslib", marker = "extra == 'deslib'", specifier = ">=0.3.5,<0.4.0" }, { name = "furo", marker = "extra == 'sphinx'", specifier = ">=2025.7.19" }, - { name = "julearn", extras = ["optuna", "skopt", "sphinx", "viz"], marker = "extra == 'docs'" }, - { name = "julearn", extras = ["optuna", "skopt", "viz"], marker = "extra == 'all'" }, + { name = "julearn", extras = ["optuna", "skopt", "sphinx", "viz", "xgboost"], marker = "extra == 'docs'" }, + { name = "julearn", extras = ["optuna", "skopt", "viz", "xgboost"], marker = "extra == 'all'" }, { name = "looseversion", marker = "python_full_version >= '3.12'", specifier = "==1.3.0" }, { name = "numpy", specifier = ">=2.3.0,<3.0.0" }, { name = "numpydoc", marker = "extra == 'sphinx'", specifier = ">=1.9.0" }, @@ -719,8 +726,9 @@ requires-dist = [ { name = "towncrier", marker = "extra == 'dev'", specifier = ">=25.8.0" }, { name = "towncrier", marker = "extra == 'sphinx'", specifier = ">=25.8.0" }, { name = "tox", marker = "extra == 'dev'" }, + { name = "xgboost", marker = "extra == 'xgboost'", specifier = ">=3.0.0,<4.0.0" }, ] -provides-extras = ["dev", "sphinx", "deslib", "viz", "skopt", "optuna", "docs", "all"] +provides-extras = ["dev", "sphinx", "deslib", "viz", "skopt", "optuna", "xgboost", 
"docs", "all"] [package.metadata.requires-dev] dev = [ @@ -746,11 +754,13 @@ docs = [ { name = "sphinx-polyversion", specifier = ">=2.0.0" }, { name = "sphinxcontrib-towncrier", specifier = ">=0.5.0a0" }, { name = "towncrier", specifier = ">=25.8.0" }, + { name = "xgboost", specifier = ">=3.0.0,<4.0.0" }, ] optional = [ { name = "optuna", specifier = ">=4.0.0,<5.0.0" }, { name = "optuna-integration", specifier = ">=4.0.0,<5.0.0" }, { name = "scikit-optimize", specifier = ">=0.10.2,<0.11.0" }, + { name = "xgboost", specifier = ">=3.0.0,<4.0.0" }, ] viz = [ { name = "bokeh", specifier = ">=3.4.3" }, @@ -1173,6 +1183,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/62/5e/3a6a3e90f35cea3853c45e5d5fb9b7192ce4384616f932cf7591298ab6e1/numpydoc-1.10.0-py3-none-any.whl", hash = "sha256:3149da9874af890bcc2a82ef7aae5484e5aa81cb2778f08e3c307ba6d963721b", size = 69255, upload-time = "2025-12-02T16:39:11.561Z" }, ] +[[package]] +name = "nvidia-nccl-cu12" +version = "2.30.4" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/bc/2b/1757b6b74ee241de5efee3f35487dcb33e09c07605254809c6ce36aeb783/nvidia_nccl_cu12-2.30.4-py3-none-manylinux_2_18_aarch64.whl", hash = "sha256:606fa9aa9215c00367d060188eb1a5bbd28396aff5e11b9200d99d1a6ab79a71", size = 300091935, upload-time = "2026-04-23T03:22:58.024Z" }, + { url = "https://files.pythonhosted.org/packages/6b/c3/0e45ff4dce8401f6ea7c25d80d75738813a47f5ae2691e2478f2fd1e5e93/nvidia_nccl_cu12-2.30.4-py3-none-manylinux_2_18_x86_64.whl", hash = "sha256:040974b261edec4b8b793e59e92ab7176fe4ab4bc61b800f9f3bfaeec2d436f3", size = 300164158, upload-time = "2026-04-23T03:23:19.589Z" }, +] + [[package]] name = "optuna" version = "4.8.0" @@ -2286,6 +2305,24 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/41/52/e465037f5375f43533d1a80b6923955201596a99142ed524d77b571a1418/wcwidth-0.7.0-py3-none-any.whl", hash = 
"sha256:5d69154c429a82910e241c738cd0e2976fac8a2dd47a1a805f4afed1c0f136f2", size = 110825, upload-time = "2026-05-02T16:04:11.033Z" }, ] +[[package]] +name = "xgboost" +version = "3.2.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy" }, + { name = "nvidia-nccl-cu12", marker = "sys_platform == 'linux'" }, + { name = "scipy" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/91/bb/1eb0242409d22db725d7a88088e6cfd6556829fb0736f9ff69aa9f1e9455/xgboost-3.2.0.tar.gz", hash = "sha256:99b0e9a2a64896cdaf509c5e46372d336c692406646d20f2af505003c0c5d70d", size = 1263936, upload-time = "2026-02-10T11:03:05.542Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2d/49/6e4cdd877c24adf56cb3586bc96d93d4dcd780b5ea1efb32e1ee0de08bae/xgboost-3.2.0-py3-none-macosx_10_15_x86_64.whl", hash = "sha256:2f661966d3e322536d9c448090a870fcba1e32ee5760c10b7c46bac7a342079a", size = 2507014, upload-time = "2026-02-10T10:50:57.44Z" }, + { url = "https://files.pythonhosted.org/packages/93/f1/c09ef1add609453aa3ba5bafcd0d1c1a805c1263c0b60138ec968f8ec296/xgboost-3.2.0-py3-none-macosx_12_0_arm64.whl", hash = "sha256:eabbd40d474b8dbf6cb3536325f9150b9e6f0db32d18de9914fb3227d0bef5b7", size = 2328527, upload-time = "2026-02-10T10:51:17.502Z" }, + { url = "https://files.pythonhosted.org/packages/96/9f/d9914a7b8df842832850b1a18e5f47aaa071c217cdd1da2ae9deb291018b/xgboost-3.2.0-py3-none-manylinux_2_28_aarch64.whl", hash = "sha256:852eabc6d3b3702a59bf78dbfdcd1cb9c4d3a3b6e5ed1f8781d8b9512354fdd2", size = 131100954, upload-time = "2026-02-10T11:02:42.704Z" }, + { url = "https://files.pythonhosted.org/packages/79/98/679de17c2caa4fd3b0b4386ecf7377301702cb0afb22930a07c142fcb1d8/xgboost-3.2.0-py3-none-manylinux_2_28_x86_64.whl", hash = "sha256:99b4a6bbcb47212fec5cf5fbe12347215f073c08967431b0122cfbd1ee70312c", size = 131748579, upload-time = "2026-02-10T10:54:40.424Z" }, + { url = 
"https://files.pythonhosted.org/packages/1f/3d/1661dd114a914a67e3f7ab66fa1382e7599c2a8c340f314ad30a3e2b4d08/xgboost-3.2.0-py3-none-win_amd64.whl", hash = "sha256:0d169736fd836fc13646c7ab787167b3a8110351c2c6bc770c755ee1618f0442", size = 101681668, upload-time = "2026-02-10T10:59:31.202Z" }, +] + [[package]] name = "xyzservices" version = "2026.3.0"