From 513d950ca4dfe06db3d700d0d5489825b031e83f Mon Sep 17 00:00:00 2001 From: Fede Date: Thu, 21 May 2026 16:40:33 +0300 Subject: [PATCH 01/24] [WIP] XGBEarlyStoppingCV --- julearn/models/xgb_earlystopping.py | 142 ++++++++++++++++++++++++++++ 1 file changed, 142 insertions(+) create mode 100644 julearn/models/xgb_earlystopping.py diff --git a/julearn/models/xgb_earlystopping.py b/julearn/models/xgb_earlystopping.py new file mode 100644 index 000000000..3a6ac073e --- /dev/null +++ b/julearn/models/xgb_earlystopping.py @@ -0,0 +1,142 @@ +# Classifier wrapper for XGBoost with cross-validated early stopping + +import copy +import inspect +from typing import Any, Dict, Self + +import numpy as np +import pandas as pd +import sklearn +from sklearn.base import BaseEstimator, ClassifierMixin +from sklearn.model_selection import ( + GroupShuffleSplit, + train_test_split, +) +from xgboost import XGBClassifier, XGBRegressor∏ + + +sklearn.set_config(enable_metadata_routing=True) + + +class _BaseXGBCVEarlyStopping(BaseEstimator): + def __init__(self, base_estimator, test_size, early_stopping_rounds, **kwargs): + self.test_size = test_size + if early_stopping_rounds is None: + raise ValueError( + "early_stopping_rounds must be set for CV early stopping." 
+ ) + self.early_stopping_rounds = early_stopping_rounds + self.random_state = kwargs.get("random_state", None) + self.base_estimator = base_estimator + self._xgboost_kwargs = kwargs + if self._xgboost_kwargs is None: + self._xgboost_kwargs = {} + self._model = None + self._is_fitted = False + self.set_fit_request(groups=True) + + def fit(self, X, y, groups=None): + # groups = kwargs.get("groups", None) + if groups is not None: + print("Using groups for early stopping CV.") + gss = GroupShuffleSplit( + n_splits=1, + test_size=self.test_size, + random_state=self.random_state, + ) + train_idx, test_idx = next(gss.split(X, y, groups)) + if isinstance(X, pd.DataFrame) or isinstance(X, pd.Series): + X_train, X_test = X.iloc[train_idx], X.iloc[test_idx] + y_train, y_test = y.iloc[train_idx], y.iloc[test_idx] + else: + X_train, X_test = X[train_idx], X[test_idx] + y_train, y_test = y[train_idx], y[test_idx] + else: + print("Not using groups for early stopping CV.") + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=self.test_size, random_state=self.random_state + ) + # Build a first model + model = self.base_estimator( + early_stopping_rounds=self.early_stopping_rounds, + **self._xgboost_kwargs, + ) + model.fit(X=X_train, y=y_train, eval_set=[(X_test, y_test)]) + + # Create a model with the max iterations set as the best epochs and refit on full data + t_kwargs = self._xgboost_kwargs.copy() + if hasattr(model, "best_ntree_limit"): + t_kwargs["n_estimators"] = model.best_ntree_limit + else: + num_parallel_tree = model.get_params().get("num_parallel_tree") + if num_parallel_tree is None: + num_parallel_tree = 1 + t_kwargs["n_estimators"] = ( + model.best_iteration + 1 + ) * num_parallel_tree + model = self.base_estimator(**t_kwargs) + model.fit(X=X, y=y) + self._model = model + self._is_fitted = True + self.classes_ = self._model.classes_ + + return self + + def predict(self, X): + if self._model is None: + raise ValueError("Model not fitted") + return 
self._model.predict(X) + + + def __sklearn_is_fitted__(self): + return hasattr(self, "_is_fitted") and self._is_fitted + + def get_params(self, deep=True): + params = { + "test_size": self.test_size, + "early_stopping_rounds": self.early_stopping_rounds, + } + params.update(self._xgboost_kwargs) + return params + + def set_params(self, **params) -> Self: + for param, value in params.items(): + if param in ["test_size", "early_stopping_rounds"]: + setattr(self, param, value) + elif param == "random_state": + self.random_state = value + self._xgboost_kwargs["random_state"] = value + else: + self._xgboost_kwargs[param] = value + return self + + +class XGBRegressorCVEarlyStopping( + _BaseXGBCVEarlyStopping, ClassifierMixin +): + + def __init__(self, test_size, early_stopping_rounds, **kwargs): + super().__init__( + base_estimator=XGBRegressor, + test_size=test_size, + early_stopping_rounds=early_stopping_rounds, + **kwargs + ) + + +class XGBClassifierCVEarlyStopping( + _BaseXGBCVEarlyStopping, ClassifierMixin +): + + def __init__(self, test_size, early_stopping_rounds, **kwargs): + super().__init__( + base_estimator=XGBClassifier, + test_size=test_size, + early_stopping_rounds=early_stopping_rounds, + **kwargs + ) + + def predict_proba(self, X): + if self._model is None: + raise ValueError("Model not fitted") + return self._model.predict_proba(X) \ No newline at end of file From aafb33e4124f675172a2adc58e8f48e0226a7c8d Mon Sep 17 00:00:00 2001 From: Fede Date: Fri, 22 May 2026 09:47:47 +0300 Subject: [PATCH 02/24] Add XGB and XGBCVEarlyStopping --- docs/available_pipeline_steps.rst | 7 ++ julearn/models/available_models.py | 14 +++ .../models/tests/test_xgb_cvearlystopping.py | 86 +++++++++++++++++++ ...arlystopping.py => xgb_cvearlystopping.py} | 69 ++++++++++----- pyproject.toml | 1 + uv.lock | 67 +++++++++++++++ 6 files changed, 221 insertions(+), 23 deletions(-) create mode 100644 julearn/models/tests/test_xgb_cvearlystopping.py rename 
julearn/models/{xgb_earlystopping.py => xgb_cvearlystopping.py} (71%) diff --git a/docs/available_pipeline_steps.rst b/docs/available_pipeline_steps.rst index 2370764e6..6e3c91407 100644 --- a/docs/available_pipeline_steps.rst +++ b/docs/available_pipeline_steps.rst @@ -235,6 +235,13 @@ Ensemble - Y - Y - Y + * - ``xgb`` + - XGBoost + - | :class:`~xgboost.XGBClassifier` and + | :class:`~xgboost.XGBRegressor` + - Y + - Y + - Y Gaussian Processes ~~~~~~~~~~~~~~~~~~ diff --git a/julearn/models/available_models.py b/julearn/models/available_models.py index bf801c8a6..f6d06997e 100644 --- a/julearn/models/available_models.py +++ b/julearn/models/available_models.py @@ -45,11 +45,16 @@ MultinomialNB, ) from sklearn.svm import SVC, SVR +from xgboost import XGBClassifier, XGBRegressor from ..utils import logger, raise_error, warn_with_log from ..utils.logging import DelayedFmtMessage as __ from ..utils.typing import ModelLike from .dynamic import DynamicSelection +from .xgb_cvearlystopping import ( + XGBClassifierCVEarlyStopping, + XGBRegressorCVEarlyStopping, +) _available_models: dict[str, dict[str, Any]] = { @@ -135,6 +140,15 @@ "regression": DummyRegressor, "classification": DummyClassifier, }, + # XGBoost + "xgb": { + "regression": XGBRegressor, + "classification": XGBClassifier, + }, + "xgb_cvearlystopping": { + "regression": XGBRegressorCVEarlyStopping, + "classification": XGBClassifierCVEarlyStopping, + }, } _available_models_reset = deepcopy(_available_models) diff --git a/julearn/models/tests/test_xgb_cvearlystopping.py b/julearn/models/tests/test_xgb_cvearlystopping.py new file mode 100644 index 000000000..e5c861529 --- /dev/null +++ b/julearn/models/tests/test_xgb_cvearlystopping.py @@ -0,0 +1,86 @@ +# Provide tests for XGBEarlyStoppingCV. 
+ +import pandas as pd +from sklearn.utils.validation import _is_fitted + +from julearn.models.xgb_cvearlystopping import ( + XGBClassifierCVEarlyStopping, + XGBRegressorCVEarlyStopping, +) + + +def test_XGBRegressorCVEarlyStopping_grouped(df_iris) -> None: + """Test XGBRegressorCVEarlyStopping with grouped data.""" + X = ["sepal_length", "sepal_width", "petal_width"] + y = "petal_length" + n_groups = 20 + bins = pd.cut( + df_iris.index.values, labels=list(range(n_groups)), bins=n_groups + ) + df_iris["group"] = bins.astype(int) + + model = XGBRegressorCVEarlyStopping( + test_size=0.2, early_stopping_rounds=5, random_state=42 + ) + + assert _is_fitted(model) is False + assert not hasattr(model, "_grouped_cv") + model.fit(df_iris[X], df_iris[y], groups=df_iris["group"]) + assert _is_fitted(model) + assert hasattr(model, "_grouped_cv") + assert model._grouped_cv is True + +def test_XGBRegressorCVEarlyStopping_notgrouped(df_iris) -> None: + """Test XGBRegressorCVEarlyStopping with non-grouped data.""" + X = ["sepal_length", "sepal_width", "petal_width"] + y = "petal_length" + + model = XGBRegressorCVEarlyStopping( + test_size=0.2, early_stopping_rounds=5, random_state=42 + ) + + assert _is_fitted(model) is False + assert not hasattr(model, "_grouped_cv") + model.fit(df_iris[X], df_iris[y]) + assert _is_fitted(model) + assert hasattr(model, "_grouped_cv") + assert model._grouped_cv is False + + +def test_XGBClassifierCVEarlyStopping_notgrouped(df_iris) -> None: + """Test XGBClassifierCVEarlyStopping with non-grouped data.""" + X = ["sepal_length", "sepal_width", "petal_width"] + y = "species" + + model = XGBClassifierCVEarlyStopping( + test_size=0.2, early_stopping_rounds=5, random_state=42 + ) + + assert _is_fitted(model) is False + assert not hasattr(model, "_grouped_cv") + model.fit(df_iris[X], df_iris[y]) + assert _is_fitted(model) + assert hasattr(model, "_grouped_cv") + assert model._grouped_cv is False + +def test_XGBClassifierCVEarlyStopping_grouped(df_iris) -> 
None: + """Test XGBClassifierCVEarlyStopping with grouped data.""" + X = ["sepal_length", "sepal_width", "petal_width"] + y = "species" + n_groups = 20 + bins = pd.cut( + df_iris.index.values, labels=list(range(n_groups)), bins=n_groups + ) + df_iris["group"] = bins.astype(int) + + model = XGBClassifierCVEarlyStopping( + test_size=0.2, early_stopping_rounds=5, random_state=42 + ) + + assert _is_fitted(model) is False + assert not hasattr(model, "_grouped_cv") + model.fit(df_iris[X], df_iris[y], groups=df_iris["group"]) + assert _is_fitted(model) + assert hasattr(model, "_grouped_cv") + assert model._grouped_cv is True + diff --git a/julearn/models/xgb_earlystopping.py b/julearn/models/xgb_cvearlystopping.py similarity index 71% rename from julearn/models/xgb_earlystopping.py rename to julearn/models/xgb_cvearlystopping.py index 3a6ac073e..6af7cdcd3 100644 --- a/julearn/models/xgb_earlystopping.py +++ b/julearn/models/xgb_cvearlystopping.py @@ -1,25 +1,30 @@ # Classifier wrapper for XGBoost with cross-validated early stopping -import copy -import inspect -from typing import Any, Dict, Self +from typing import Self import numpy as np import pandas as pd import sklearn +from scipy.sparse._bsr import spmatrix from sklearn.base import BaseEstimator, ClassifierMixin from sklearn.model_selection import ( GroupShuffleSplit, train_test_split, ) -from xgboost import XGBClassifier, XGBRegressor∏ +from xgboost import XGBClassifier, XGBRegressor sklearn.set_config(enable_metadata_routing=True) class _BaseXGBCVEarlyStopping(BaseEstimator): - def __init__(self, base_estimator, test_size, early_stopping_rounds, **kwargs): + def __init__( + self, + base_estimator, + test_size, + early_stopping_rounds, + **kwargs, + ): self.test_size = test_size if early_stopping_rounds is None: raise ValueError( @@ -36,26 +41,28 @@ def __init__(self, base_estimator, test_size, early_stopping_rounds, **kwargs): self.set_fit_request(groups=True) def fit(self, X, y, groups=None): - # groups = 
kwargs.get("groups", None) if groups is not None: - print("Using groups for early stopping CV.") gss = GroupShuffleSplit( n_splits=1, test_size=self.test_size, random_state=self.random_state, ) train_idx, test_idx = next(gss.split(X, y, groups)) - if isinstance(X, pd.DataFrame) or isinstance(X, pd.Series): + if isinstance(X, pd.DataFrame): X_train, X_test = X.iloc[train_idx], X.iloc[test_idx] - y_train, y_test = y.iloc[train_idx], y.iloc[test_idx] else: X_train, X_test = X[train_idx], X[test_idx] + + if isinstance(X, pd.Series): + y_train, y_test = y.iloc[train_idx], y.iloc[test_idx] + else: y_train, y_test = y[train_idx], y[test_idx] + self._grouped_cv = True else: - print("Not using groups for early stopping CV.") X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=self.test_size, random_state=self.random_state ) + self._grouped_cv = False # Build a first model model = self.base_estimator( early_stopping_rounds=self.early_stopping_rounds, @@ -78,7 +85,6 @@ def fit(self, X, y, groups=None): model.fit(X=X, y=y) self._model = model self._is_fitted = True - self.classes_ = self._model.classes_ return self @@ -87,7 +93,6 @@ def predict(self, X): raise ValueError("Model not fitted") return self._model.predict(X) - def __sklearn_is_fitted__(self): return hasattr(self, "_is_fitted") and self._is_fitted @@ -111,32 +116,50 @@ def set_params(self, **params) -> Self: return self -class XGBRegressorCVEarlyStopping( - _BaseXGBCVEarlyStopping, ClassifierMixin -): - +class XGBRegressorCVEarlyStopping(_BaseXGBCVEarlyStopping, ClassifierMixin): def __init__(self, test_size, early_stopping_rounds, **kwargs): super().__init__( base_estimator=XGBRegressor, test_size=test_size, early_stopping_rounds=early_stopping_rounds, - **kwargs + **kwargs, ) -class XGBClassifierCVEarlyStopping( - _BaseXGBCVEarlyStopping, ClassifierMixin -): - +class XGBClassifierCVEarlyStopping(_BaseXGBCVEarlyStopping, ClassifierMixin): def __init__(self, test_size, early_stopping_rounds, 
**kwargs): super().__init__( base_estimator=XGBClassifier, test_size=test_size, early_stopping_rounds=early_stopping_rounds, - **kwargs + **kwargs, ) + def fit(self, X, y, groups=None): + self._label_encoder = None + # Check if labels are strings and convert to integers if so, to avoid issues with XGBoost + if isinstance(y, pd.Series) and y.dtype in ["object", "string"]: + self._label_encoder = sklearn.preprocessing.LabelEncoder() + y = self._label_encoder.fit_transform(y) + elif isinstance(y, np.ndarray) and y.dtype == "object": + self._label_encoder = sklearn.preprocessing.LabelEncoder() + y = self._label_encoder.fit_transform(y) + out = super().fit(X, y, groups) + self.classes_ = self._model.classes_ + return out + + def predict(self, X): + out = super().predict(X) + if self._label_encoder is not None: + out = self._label_encoder.inverse_transform(out) + return out + def predict_proba(self, X): if self._model is None: raise ValueError("Model not fitted") - return self._model.predict_proba(X) \ No newline at end of file + return self._model.predict_proba(X) + + def score(self, X, y, sample_weight=None) -> float: + if self._label_encoder is not None: + y = self._label_encoder.transform(y) + return super().score(X, y, sample_weight) diff --git a/pyproject.toml b/pyproject.toml index 7ce123698..30ced19cb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,6 +38,7 @@ dependencies = [ "statsmodels>=0.13,<0.15", "scikit-learn>=1.5.0,<1.9.0", "looseversion==1.3.0; python_version>='3.12'", + "xgboost>=2.1.4", ] dynamic = ["version"] diff --git a/uv.lock b/uv.lock index b83739278..278d01e51 100644 --- a/uv.lock +++ b/uv.lock @@ -587,6 +587,8 @@ dependencies = [ { name = "pandas" }, { name = "scikit-learn" }, { name = "statsmodels" }, + { name = "xgboost", version = "2.1.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, + { name = "xgboost", version = "3.2.0", source = { registry = "https://pypi.org/simple" }, marker = 
"python_full_version >= '3.10'" }, ] [package.optional-dependencies] @@ -719,6 +721,7 @@ requires-dist = [ { name = "towncrier", marker = "extra == 'dev'", specifier = ">=25.8.0" }, { name = "towncrier", marker = "extra == 'sphinx'", specifier = ">=25.8.0" }, { name = "tox", marker = "extra == 'dev'" }, + { name = "xgboost", specifier = ">=2.1.4" }, ] provides-extras = ["dev", "sphinx", "deslib", "viz", "skopt", "optuna", "docs", "all"] @@ -1173,6 +1176,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/62/5e/3a6a3e90f35cea3853c45e5d5fb9b7192ce4384616f932cf7591298ab6e1/numpydoc-1.10.0-py3-none-any.whl", hash = "sha256:3149da9874af890bcc2a82ef7aae5484e5aa81cb2778f08e3c307ba6d963721b", size = 69255, upload-time = "2025-12-02T16:39:11.561Z" }, ] +[[package]] +name = "nvidia-nccl-cu12" +version = "2.30.4" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/bc/2b/1757b6b74ee241de5efee3f35487dcb33e09c07605254809c6ce36aeb783/nvidia_nccl_cu12-2.30.4-py3-none-manylinux_2_18_aarch64.whl", hash = "sha256:606fa9aa9215c00367d060188eb1a5bbd28396aff5e11b9200d99d1a6ab79a71", size = 300091935, upload-time = "2026-04-23T03:22:58.024Z" }, + { url = "https://files.pythonhosted.org/packages/6b/c3/0e45ff4dce8401f6ea7c25d80d75738813a47f5ae2691e2478f2fd1e5e93/nvidia_nccl_cu12-2.30.4-py3-none-manylinux_2_18_x86_64.whl", hash = "sha256:040974b261edec4b8b793e59e92ab7176fe4ab4bc61b800f9f3bfaeec2d436f3", size = 300164158, upload-time = "2026-04-23T03:23:19.589Z" }, +] + [[package]] name = "optuna" version = "4.8.0" @@ -2286,6 +2298,61 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/41/52/e465037f5375f43533d1a80b6923955201596a99142ed524d77b571a1418/wcwidth-0.7.0-py3-none-any.whl", hash = "sha256:5d69154c429a82910e241c738cd0e2976fac8a2dd47a1a805f4afed1c0f136f2", size = 110825, upload-time = "2026-05-02T16:04:11.033Z" }, ] +[[package]] +name = "xgboost" +version = "2.1.4" +source = { registry = 
"https://pypi.org/simple" } +resolution-markers = [ + "python_full_version < '3.10'", +] +dependencies = [ + { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, + { name = "nvidia-nccl-cu12", marker = "python_full_version < '3.10' and platform_machine != 'aarch64' and sys_platform == 'linux'" }, + { name = "scipy", version = "1.13.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/e2/5e/860a1ef13ce38db8c257c83e138be64bcffde8f401e84bf1e2e91838afa3/xgboost-2.1.4.tar.gz", hash = "sha256:ab84c4bbedd7fae1a26f61e9dd7897421d5b08454b51c6eb072abc1d346d08d7", size = 1091127, upload-time = "2025-02-06T18:18:20.192Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b6/fe/7a1d2342c2e93f22b41515e02b73504c7809247b16ae395bd2ee7ef11e19/xgboost-2.1.4-py3-none-macosx_10_15_x86_64.macosx_11_0_x86_64.macosx_12_0_x86_64.whl", hash = "sha256:78d88da184562deff25c820d943420342014dd55e0f4c017cc4563c2148df5ee", size = 2140692, upload-time = "2025-02-06T18:16:59.23Z" }, + { url = "https://files.pythonhosted.org/packages/f5/b6/653a70910739f127adffbefb688ebc22b51139292757de7c22b1e04ce792/xgboost-2.1.4-py3-none-macosx_12_0_arm64.whl", hash = "sha256:523db01d4e74b05c61a985028bde88a4dd380eadc97209310621996d7d5d14a7", size = 1939418, upload-time = "2025-02-06T18:17:02.494Z" }, + { url = "https://files.pythonhosted.org/packages/43/06/905fee34c10fb0d0c3baa15106413b76f360d8e958765ec57c9eddf762fa/xgboost-2.1.4-py3-none-manylinux2014_aarch64.whl", hash = "sha256:57c7e98111aceef4b689d7d2ce738564a1f7fe44237136837a47847b8b33bade", size = 4442052, upload-time = "2025-02-06T18:17:04.029Z" }, + { url = "https://files.pythonhosted.org/packages/f8/6a/41956f91ab984f2fa44529b2551d825a20d33807eba051a60d06ede2a87c/xgboost-2.1.4-py3-none-manylinux2014_x86_64.whl", hash = 
"sha256:f1343a512e634822eab30d300bfc00bf777dc869d881cc74854b42173cfcdb14", size = 4533170, upload-time = "2025-02-06T18:17:05.753Z" }, + { url = "https://files.pythonhosted.org/packages/b1/53/37032dca20dae7a88ad1907f817a81f232ca6e935f0c28c98db3c0a0bd22/xgboost-2.1.4-py3-none-manylinux_2_28_aarch64.whl", hash = "sha256:d366097d0db047315736f46af852feaa907f6d7371716af741cdce488ae36d20", size = 4206715, upload-time = "2025-02-06T18:17:08.448Z" }, + { url = "https://files.pythonhosted.org/packages/e4/3c/e3a93bfa7e8693c825df5ec02a40f7ff5f0950e02198b1e85da9315a8d47/xgboost-2.1.4-py3-none-manylinux_2_28_x86_64.whl", hash = "sha256:8df6da72963969ab2bf49a520c3e147b1e15cbeddd3aa0e3e039b3532c739339", size = 223642416, upload-time = "2025-02-06T18:17:25.08Z" }, + { url = "https://files.pythonhosted.org/packages/43/80/0b5a2dfcf5b4da27b0b68d2833f05d77e1a374d43db951fca200a1f12a52/xgboost-2.1.4-py3-none-win_amd64.whl", hash = "sha256:8bbfe4fedc151b83a52edbf0de945fd94358b09a81998f2945ad330fd5f20cd6", size = 124910381, upload-time = "2025-02-06T18:17:43.202Z" }, +] + +[[package]] +name = "xgboost" +version = "3.2.0" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version >= '3.14' and sys_platform == 'win32'", + "python_full_version >= '3.14' and sys_platform == 'emscripten'", + "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32'", + "python_full_version >= '3.12' and python_full_version < '3.14' and sys_platform == 'win32'", + "python_full_version >= '3.12' and python_full_version < '3.14' and sys_platform == 'emscripten'", + "python_full_version >= '3.12' and python_full_version < '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32'", + "python_full_version == '3.11.*' and sys_platform == 'win32'", + "python_full_version == '3.11.*' and sys_platform == 'emscripten'", + "python_full_version == '3.11.*' and sys_platform != 'emscripten' and sys_platform != 'win32'", + "python_full_version 
== '3.10.*'", +] +dependencies = [ + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.10.*'" }, + { name = "numpy", version = "2.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, + { name = "nvidia-nccl-cu12", marker = "python_full_version >= '3.10' and sys_platform == 'linux'" }, + { name = "scipy", version = "1.15.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.10.*'" }, + { name = "scipy", version = "1.17.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/91/bb/1eb0242409d22db725d7a88088e6cfd6556829fb0736f9ff69aa9f1e9455/xgboost-3.2.0.tar.gz", hash = "sha256:99b0e9a2a64896cdaf509c5e46372d336c692406646d20f2af505003c0c5d70d", size = 1263936, upload-time = "2026-02-10T11:03:05.542Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2d/49/6e4cdd877c24adf56cb3586bc96d93d4dcd780b5ea1efb32e1ee0de08bae/xgboost-3.2.0-py3-none-macosx_10_15_x86_64.whl", hash = "sha256:2f661966d3e322536d9c448090a870fcba1e32ee5760c10b7c46bac7a342079a", size = 2507014, upload-time = "2026-02-10T10:50:57.44Z" }, + { url = "https://files.pythonhosted.org/packages/93/f1/c09ef1add609453aa3ba5bafcd0d1c1a805c1263c0b60138ec968f8ec296/xgboost-3.2.0-py3-none-macosx_12_0_arm64.whl", hash = "sha256:eabbd40d474b8dbf6cb3536325f9150b9e6f0db32d18de9914fb3227d0bef5b7", size = 2328527, upload-time = "2026-02-10T10:51:17.502Z" }, + { url = "https://files.pythonhosted.org/packages/96/9f/d9914a7b8df842832850b1a18e5f47aaa071c217cdd1da2ae9deb291018b/xgboost-3.2.0-py3-none-manylinux_2_28_aarch64.whl", hash = "sha256:852eabc6d3b3702a59bf78dbfdcd1cb9c4d3a3b6e5ed1f8781d8b9512354fdd2", size = 131100954, upload-time = "2026-02-10T11:02:42.704Z" }, + { url = 
"https://files.pythonhosted.org/packages/79/98/679de17c2caa4fd3b0b4386ecf7377301702cb0afb22930a07c142fcb1d8/xgboost-3.2.0-py3-none-manylinux_2_28_x86_64.whl", hash = "sha256:99b4a6bbcb47212fec5cf5fbe12347215f073c08967431b0122cfbd1ee70312c", size = 131748579, upload-time = "2026-02-10T10:54:40.424Z" }, + { url = "https://files.pythonhosted.org/packages/1f/3d/1661dd114a914a67e3f7ab66fa1382e7599c2a8c340f314ad30a3e2b4d08/xgboost-3.2.0-py3-none-win_amd64.whl", hash = "sha256:0d169736fd836fc13646c7ab787167b3a8110351c2c6bc770c755ee1618f0442", size = 101681668, upload-time = "2026-02-10T10:59:31.202Z" }, +] + [[package]] name = "xyzservices" version = "2026.3.0" From 6b03ab801eea35deb80b69ff0f860eb4d7c6ca12 Mon Sep 17 00:00:00 2001 From: Fede Date: Fri, 22 May 2026 09:50:43 +0300 Subject: [PATCH 03/24] Modify docs for XGB CV Early stopping --- docs/available_pipeline_steps.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/docs/available_pipeline_steps.rst b/docs/available_pipeline_steps.rst index 6e3c91407..089d3a4ac 100644 --- a/docs/available_pipeline_steps.rst +++ b/docs/available_pipeline_steps.rst @@ -242,6 +242,13 @@ Ensemble - Y - Y - Y + * - ``xgb_cvearlystopping`` + - XGBoost with Cross-Validation and Early Stopping + - | :class:`~julearn.models.xgb_cvearlystopping.XGBClassifierCVEarlyStopping` and + | :class:`~julearn.models.xgb_cvearlystopping.XGBRegressorCVEarlyStopping` + - Y + - Y + - Y Gaussian Processes ~~~~~~~~~~~~~~~~~~ From 99233c3ea809f9fc158447e58ee0d2a32e5af055 Mon Sep 17 00:00:00 2001 From: Fede Raimondo Date: Wed, 27 May 2026 15:04:24 +0300 Subject: [PATCH 04/24] Add docstring --- .../models/tests/test_xgb_cvearlystopping.py | 44 ++- julearn/models/xgb_cvearlystopping.py | 274 ++++++++++++++++-- 2 files changed, 282 insertions(+), 36 deletions(-) diff --git a/julearn/models/tests/test_xgb_cvearlystopping.py b/julearn/models/tests/test_xgb_cvearlystopping.py index e5c861529..0cc25379e 100644 --- 
a/julearn/models/tests/test_xgb_cvearlystopping.py +++ b/julearn/models/tests/test_xgb_cvearlystopping.py @@ -1,4 +1,7 @@ -# Provide tests for XGBEarlyStoppingCV. +"""Provide tests for XGBEarlyStoppingCV.""" + +# Authors: Federico Raimondo +# License: AGPL import pandas as pd from sklearn.utils.validation import _is_fitted @@ -10,7 +13,14 @@ def test_XGBRegressorCVEarlyStopping_grouped(df_iris) -> None: - """Test XGBRegressorCVEarlyStopping with grouped data.""" + """Test XGBRegressorCVEarlyStopping with grouped data. + + Parameters + ---------- + df_iris : pd.DataFrame + The iris dataset as a DataFrame. + + """ X = ["sepal_length", "sepal_width", "petal_width"] y = "petal_length" n_groups = 20 @@ -30,8 +40,16 @@ def test_XGBRegressorCVEarlyStopping_grouped(df_iris) -> None: assert hasattr(model, "_grouped_cv") assert model._grouped_cv is True + def test_XGBRegressorCVEarlyStopping_notgrouped(df_iris) -> None: - """Test XGBRegressorCVEarlyStopping with non-grouped data.""" + """Test XGBRegressorCVEarlyStopping with non-grouped data. + + Parameters + ---------- + df_iris : pd.DataFrame + The iris dataset as a DataFrame. + + """ X = ["sepal_length", "sepal_width", "petal_width"] y = "petal_length" @@ -48,7 +66,14 @@ def test_XGBRegressorCVEarlyStopping_notgrouped(df_iris) -> None: def test_XGBClassifierCVEarlyStopping_notgrouped(df_iris) -> None: - """Test XGBClassifierCVEarlyStopping with non-grouped data.""" + """Test XGBClassifierCVEarlyStopping with non-grouped data. + + Parameters + ---------- + df_iris : pd.DataFrame + The iris dataset as a DataFrame. + + """ X = ["sepal_length", "sepal_width", "petal_width"] y = "species" @@ -63,8 +88,16 @@ def test_XGBClassifierCVEarlyStopping_notgrouped(df_iris) -> None: assert hasattr(model, "_grouped_cv") assert model._grouped_cv is False + def test_XGBClassifierCVEarlyStopping_grouped(df_iris) -> None: - """Test XGBClassifierCVEarlyStopping with grouped data.""" + """Test XGBClassifierCVEarlyStopping with grouped data. 
+ + Parameters + ---------- + df_iris : pd.DataFrame + The iris dataset as a DataFrame. + + """ X = ["sepal_length", "sepal_width", "petal_width"] y = "species" n_groups = 20 @@ -83,4 +116,3 @@ def test_XGBClassifierCVEarlyStopping_grouped(df_iris) -> None: assert _is_fitted(model) assert hasattr(model, "_grouped_cv") assert model._grouped_cv is True - diff --git a/julearn/models/xgb_cvearlystopping.py b/julearn/models/xgb_cvearlystopping.py index 6af7cdcd3..267c75398 100644 --- a/julearn/models/xgb_cvearlystopping.py +++ b/julearn/models/xgb_cvearlystopping.py @@ -1,28 +1,55 @@ -# Classifier wrapper for XGBoost with cross-validated early stopping +"""Classifier wrapper for XGBoost with cross-validated early stopping.""" -from typing import Self +# Authors: Federico Raimondo +# License: AGPL + +from typing import Any import numpy as np import pandas as pd import sklearn -from scipy.sparse._bsr import spmatrix -from sklearn.base import BaseEstimator, ClassifierMixin +from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin from sklearn.model_selection import ( GroupShuffleSplit, train_test_split, ) +from sklearn.preprocessing import LabelEncoder from xgboost import XGBClassifier, XGBRegressor +from julearn.utils.typing import DataLike + sklearn.set_config(enable_metadata_routing=True) class _BaseXGBCVEarlyStopping(BaseEstimator): + """Base class for XGBoost with cross-validated early stopping. + + A wrapper for XGBoost that performs early stopping using a + cross-validation split of the data. The model is first trained on a + training set with early stopping based on a validation set, and then refit + on the full data using the best number of iterations found. + + Parameters + ---------- + base_estimator : class + The base XGBoost estimator class to use (e.g. XGBRegressor or + XGBClassifier). + test_size : float + The proportion of the data to use as the validation set for early + stopping. 
+ early_stopping_rounds : int + The number of rounds to use for early stopping. + **kwargs : dict + Extra keyword arguments to pass to the XGBoost estimator. + + """ + def __init__( self, - base_estimator, - test_size, - early_stopping_rounds, + base_estimator: XGBRegressor | XGBClassifier, + test_size: float | None, + early_stopping_rounds: int, **kwargs, ): self.test_size = test_size @@ -40,7 +67,31 @@ def __init__( self._is_fitted = False self.set_fit_request(groups=True) - def fit(self, X, y, groups=None): + def fit( + self, + X: DataLike, # noqa: N803 + y: DataLike, + groups: DataLike | None = None, + ) -> "_BaseXGBCVEarlyStopping": + """Fit the model. + + Parameters + ---------- + X : DataLike + The data to fit the model on. + y : DataLike + The target data. + groups : DataLike, optional + The group labels for the samples used while splitting the dataset + into train/test set for early stopping. If None, standard + train/test split is used, by default None. + + Returns + ------- + _BaseXGBCVEarlyStopping + The fitted model. + + """ if groups is not None: gss = GroupShuffleSplit( n_splits=1, @@ -53,7 +104,7 @@ def fit(self, X, y, groups=None): else: X_train, X_test = X[train_idx], X[test_idx] - if isinstance(X, pd.Series): + if isinstance(y, pd.Series): y_train, y_test = y.iloc[train_idx], y.iloc[test_idx] else: y_train, y_test = y[train_idx], y[test_idx] @@ -70,7 +121,8 @@ def fit(self, X, y, groups=None): ) model.fit(X=X_train, y=y_train, eval_set=[(X_test, y_test)]) - # Create a model with the max iterations set as the best epochs and refit on full data + # Create a model with the max iterations set as the best epochs and + # refit on full data t_kwargs = self._xgboost_kwargs.copy() if hasattr(model, "best_ntree_limit"): t_kwargs["n_estimators"] = model.best_ntree_limit @@ -88,15 +140,50 @@ def fit(self, X, y, groups=None): return self - def predict(self, X): + def predict(self, X: DataLike) -> DataLike: # noqa: N803 + """Predict using the model. 
+ + Parameters + ---------- + X : pd.DataFrame + The data to predict on. + + Returns + ------- + DataLike + The predictions. + + """ if self._model is None: raise ValueError("Model not fitted") return self._model.predict(X) - def __sklearn_is_fitted__(self): + def __sklearn_is_fitted__(self) -> bool: + """Check if the model is fitted. + + Returns + ------- + bool + True if the model is fitted, False otherwise. + + """ return hasattr(self, "_is_fitted") and self._is_fitted - def get_params(self, deep=True): + def get_params(self, deep: bool = True) -> dict: + """Get the parameters of the model. + + Parameters + ---------- + deep : bool + If True, will return the parameters for this model and + contained subobjects that are estimators (default is True). + + Returns + ------- + params : dict + Parameter names mapped to their values. + + """ params = { "test_size": self.test_size, "early_stopping_rounds": self.early_stopping_rounds, @@ -104,7 +191,20 @@ def get_params(self, deep=True): params.update(self._xgboost_kwargs) return params - def set_params(self, **params) -> Self: + def set_params(self, **params: Any) -> "_BaseXGBCVEarlyStopping": + """Set the parameters of the model. + + Parameters + ---------- + **params : dict + Estimator parameters. + + Returns + ------- + _BaseXGBCVEarlyStopping + The model with updated parameters. + + """ for param, value in params.items(): if param in ["test_size", "early_stopping_rounds"]: setattr(self, param, value) @@ -116,8 +216,32 @@ def set_params(self, **params) -> Self: return self -class XGBRegressorCVEarlyStopping(_BaseXGBCVEarlyStopping, ClassifierMixin): - def __init__(self, test_size, early_stopping_rounds, **kwargs): +class XGBRegressorCVEarlyStopping(_BaseXGBCVEarlyStopping, RegressorMixin): + """XGBRegressor with cross-validated early stopping. + + A wrapper for XGBoost that performs early stopping using a + cross-validation split of the data. 
The model is first trained on a + training set with early stopping based on a validation set, and then refit + on the full data using the best number of iterations found. + + Parameters + ---------- + test_size : float + The proportion of the data to use as the validation set for early + stopping. + early_stopping_rounds : int + The number of rounds to use for early stopping. + **kwargs : dict + Extra keyword arguments to pass to the XGBRegressor. + + """ + + def __init__( + self, + test_size: float | None, + early_stopping_rounds: int, + **kwargs: Any, + ): super().__init__( base_estimator=XGBRegressor, test_size=test_size, @@ -127,7 +251,31 @@ def __init__(self, test_size, early_stopping_rounds, **kwargs): class XGBClassifierCVEarlyStopping(_BaseXGBCVEarlyStopping, ClassifierMixin): - def __init__(self, test_size, early_stopping_rounds, **kwargs): + """XGBClassifier with cross-validated early stopping. + + A wrapper for XGBoost that performs early stopping using a + cross-validation split of the data. The model is first trained on a + training set with early stopping based on a validation set, and then refit + on the full data using the best number of iterations found. + + Parameters + ---------- + test_size : float + The proportion of the data to use as the validation set for early + stopping. + early_stopping_rounds : int + The number of rounds to use for early stopping. + **kwargs : dict + Extra keyword arguments to pass to the XGBClassifier. + + """ + + def __init__( + self, + test_size: float | None, + early_stopping_rounds: int, + **kwargs: Any, + ): super().__init__( base_estimator=XGBClassifier, test_size=test_size, @@ -135,31 +283,97 @@ def __init__(self, test_size, early_stopping_rounds, **kwargs): **kwargs, ) - def fit(self, X, y, groups=None): + def fit( + self, + X: DataLike, # noqa: N803 + y: DataLike, + groups: DataLike | None = None, + ) -> "XGBClassifierCVEarlyStopping": + """Fit the model. 
+ + Parameters + ---------- + X : DataLike + The data to fit the model on. + y : DataLike + The target data. + groups : DataLike, optional + The group labels for the samples used while splitting the dataset + into train/test set for early stopping. If None, standard + train/test split is used, by default None. + + Returns + ------- + XGBClassifierCVEarlyStopping + The fitted model. + + """ self._label_encoder = None - # Check if labels are strings and convert to integers if so, to avoid issues with XGBoost + # Check if labels are strings and convert to integers if so, to avoid + # issues with XGBoost if isinstance(y, pd.Series) and y.dtype in ["object", "string"]: - self._label_encoder = sklearn.preprocessing.LabelEncoder() - y = self._label_encoder.fit_transform(y) + self._label_encoder = LabelEncoder() + y = self._label_encoder.fit_transform(y) # type: ignore elif isinstance(y, np.ndarray) and y.dtype == "object": - self._label_encoder = sklearn.preprocessing.LabelEncoder() - y = self._label_encoder.fit_transform(y) - out = super().fit(X, y, groups) - self.classes_ = self._model.classes_ - return out + self._label_encoder = LabelEncoder() + y = self._label_encoder.fit_transform(y) # type: ignore + super().fit(X, y, groups) + self.classes_ = self._model.classes_ # type: ignore + return self + + def predict(self, X: DataLike) -> DataLike: # noqa: N803 + """Predict using the model. + + Parameters + ---------- + X : pd.DataFrame + The data to predict on. + + Returns + ------- + DataLike + The predictions. - def predict(self, X): + """ out = super().predict(X) if self._label_encoder is not None: out = self._label_encoder.inverse_transform(out) return out - def predict_proba(self, X): + def predict_proba(self, X: DataLike) -> DataLike: # noqa: N803 + """Predict probabilities using the model. + + Parameters + ---------- + X : pd.DataFrame + The data to predict on. + + Returns + ------- + DataLike + The predictions. 
+ + """ if self._model is None: raise ValueError("Model not fitted") return self._model.predict_proba(X) - def score(self, X, y, sample_weight=None) -> float: + def score(self, X: DataLike, y: DataLike) -> float: # noqa: N803 + """Score the model. + + Parameters + ---------- + X : pd.DataFrame + The data to predict on. + y : DataLike + The true target values. + + Returns + ------- + float + The score. + + """ if self._label_encoder is not None: y = self._label_encoder.transform(y) - return super().score(X, y, sample_weight) + return super().score(X, y) From 081cc6b8f1c67d73a68727a9e9848476cc6500a6 Mon Sep 17 00:00:00 2001 From: Fede Raimondo Date: Wed, 27 May 2026 15:14:32 +0300 Subject: [PATCH 05/24] Fix uv + xgboost dependency optional --- julearn/models/available_models.py | 30 +++++++++++---- pyproject.toml | 6 ++- uv.lock | 62 ++++++++---------------------- 3 files changed, 45 insertions(+), 53 deletions(-) diff --git a/julearn/models/available_models.py b/julearn/models/available_models.py index f6d06997e..97fdaf016 100644 --- a/julearn/models/available_models.py +++ b/julearn/models/available_models.py @@ -45,7 +45,14 @@ MultinomialNB, ) from sklearn.svm import SVC, SVR -from xgboost import XGBClassifier, XGBRegressor + + +try: # pragma: no cover + from xgboost import XGBClassifier, XGBRegressor + + _has_xgboost = True +except ImportError: + _has_xgboost = False from ..utils import logger, raise_error, warn_with_log from ..utils.logging import DelayedFmtMessage as __ @@ -140,16 +147,25 @@ "regression": DummyRegressor, "classification": DummyClassifier, }, - # XGBoost - "xgb": { +} + +if _has_xgboost is True: + _available_models["xgb"] = { "regression": XGBRegressor, "classification": XGBClassifier, - }, - "xgb_cvearlystopping": { + } + _available_models["xgb_cvearlystopping"] = { "regression": XGBRegressorCVEarlyStopping, "classification": XGBClassifierCVEarlyStopping, - }, -} + } + logger.info( + "XGBoost is available and has been added to the model 
registry." + ) +else: + logger.info( + "XGBoost is not available and has not been added to the model " + "registry. To use XGBoost models, please install the xgboost package." + ) _available_models_reset = deepcopy(_available_models) diff --git a/pyproject.toml b/pyproject.toml index 30ced19cb..85f95339b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -81,12 +81,15 @@ optuna = [ "optuna_integration>=4.0.0,<5.0.0", ] +xgboost = ["xgboost>=2.1.4,<4.0.0"] + + # Everything we need for docs is here docs = ["julearn[sphinx,viz,optuna,skopt]"] # Add all optional functional dependencies (skip deslib until its fixed) # This does not include dev/docs building dependencies -all = ["julearn[viz,skopt,optuna]"] +all = ["julearn[viz,skopt,optuna,xgboost]"] ################ # Tool configs # @@ -302,6 +305,7 @@ optional = [ "optuna>=4.0.0,<5.0.0", "optuna-integration>=4.0.0,<5.0.0", "scikit-optimize>=0.10.2,<0.11.0", + "xgboost>=2.1.4,<4.0.0", ] viz = [ "bokeh>=3.4.3", diff --git a/uv.lock b/uv.lock index 278d01e51..77dc9e139 100644 --- a/uv.lock +++ b/uv.lock @@ -587,8 +587,7 @@ dependencies = [ { name = "pandas" }, { name = "scikit-learn" }, { name = "statsmodels" }, - { name = "xgboost", version = "2.1.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, - { name = "xgboost", version = "3.2.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, + { name = "xgboost" }, ] [package.optional-dependencies] @@ -599,6 +598,7 @@ all = [ { name = "panel" }, { name = "param" }, { name = "scikit-optimize" }, + { name = "xgboost" }, ] deslib = [ { name = "deslib" }, @@ -653,6 +653,9 @@ viz = [ { name = "panel" }, { name = "param" }, ] +xgboost = [ + { name = "xgboost" }, +] [package.dev-dependencies] dev = [ @@ -683,6 +686,7 @@ optional = [ { name = "optuna" }, { name = "optuna-integration" }, { name = "scikit-optimize" }, + { name = "xgboost" }, ] viz = [ { name = "bokeh" }, @@ -696,7 +700,7 @@ 
requires-dist = [ { name = "deslib", marker = "extra == 'deslib'", specifier = ">=0.3.5,<0.4.0" }, { name = "furo", marker = "extra == 'sphinx'", specifier = ">=2025.7.19" }, { name = "julearn", extras = ["optuna", "skopt", "sphinx", "viz"], marker = "extra == 'docs'" }, - { name = "julearn", extras = ["optuna", "skopt", "viz"], marker = "extra == 'all'" }, + { name = "julearn", extras = ["optuna", "skopt", "viz", "xgboost"], marker = "extra == 'all'" }, { name = "looseversion", marker = "python_full_version >= '3.12'", specifier = "==1.3.0" }, { name = "numpy", specifier = ">=2.3.0,<3.0.0" }, { name = "numpydoc", marker = "extra == 'sphinx'", specifier = ">=1.9.0" }, @@ -722,8 +726,9 @@ requires-dist = [ { name = "towncrier", marker = "extra == 'sphinx'", specifier = ">=25.8.0" }, { name = "tox", marker = "extra == 'dev'" }, { name = "xgboost", specifier = ">=2.1.4" }, + { name = "xgboost", marker = "extra == 'xgboost'", specifier = ">=2.1.4,<4.0.0" }, ] -provides-extras = ["dev", "sphinx", "deslib", "viz", "skopt", "optuna", "docs", "all"] +provides-extras = ["dev", "sphinx", "deslib", "viz", "skopt", "optuna", "xgboos", "docs", "all"] [package.metadata.requires-dev] dev = [ @@ -754,6 +759,7 @@ optional = [ { name = "optuna", specifier = ">=4.0.0,<5.0.0" }, { name = "optuna-integration", specifier = ">=4.0.0,<5.0.0" }, { name = "scikit-optimize", specifier = ">=0.10.2,<0.11.0" }, + { name = "xgboost", specifier = ">=2.1.4,<4.0.0" }, ] viz = [ { name = "bokeh", specifier = ">=3.4.3" }, @@ -1976,7 +1982,10 @@ wheels = [ name = "sphinx-polyversion" version = "2.0.0" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/75/01/7755fc4b01ff281df937f6563c190b55313a14e9b1b5ac960003933f0793/sphinx_polyversion-2.0.0.tar.gz", hash = "sha256:ce5d15bbf5d2003aaec9e25c1646ec4f8a91cd55dac89df60ff17bd15630f926", size = 32265, upload-time = "2025-09-25T15:15:13.035Z" } +dependencies = [ + { name = "sphinx" }, +] +sdist = { url 
= "https://files.pythonhosted.org/packages/9a/10/25231164a97a9016bdc73a3530af8f4a6846bdc564af1460af2ff3e59a50/sphinx-multiversion-0.2.4.tar.gz", hash = "sha256:5cd1ca9ecb5eed63cb8d6ce5e9c438ca13af4fa98e7eb6f376be541dd4990bcb", size = 7024, upload-time = "2020-08-12T15:48:20.566Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/13/ac/88c719ec04351cd1d56c5faab18e6183b3d82eb5c64a10284feeaadb6ff1/sphinx_polyversion-2.0.0-py3-none-any.whl", hash = "sha256:5455e8a5560d587a0401009196612c98b067138e69556891eb9b8edc547e1327", size = 36628, upload-time = "2025-09-25T15:15:11.939Z" }, ] @@ -2298,51 +2307,14 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/41/52/e465037f5375f43533d1a80b6923955201596a99142ed524d77b571a1418/wcwidth-0.7.0-py3-none-any.whl", hash = "sha256:5d69154c429a82910e241c738cd0e2976fac8a2dd47a1a805f4afed1c0f136f2", size = 110825, upload-time = "2026-05-02T16:04:11.033Z" }, ] -[[package]] -name = "xgboost" -version = "2.1.4" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.10'", -] -dependencies = [ - { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, - { name = "nvidia-nccl-cu12", marker = "python_full_version < '3.10' and platform_machine != 'aarch64' and sys_platform == 'linux'" }, - { name = "scipy", version = "1.13.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/e2/5e/860a1ef13ce38db8c257c83e138be64bcffde8f401e84bf1e2e91838afa3/xgboost-2.1.4.tar.gz", hash = "sha256:ab84c4bbedd7fae1a26f61e9dd7897421d5b08454b51c6eb072abc1d346d08d7", size = 1091127, upload-time = "2025-02-06T18:18:20.192Z" } -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/b6/fe/7a1d2342c2e93f22b41515e02b73504c7809247b16ae395bd2ee7ef11e19/xgboost-2.1.4-py3-none-macosx_10_15_x86_64.macosx_11_0_x86_64.macosx_12_0_x86_64.whl", hash = "sha256:78d88da184562deff25c820d943420342014dd55e0f4c017cc4563c2148df5ee", size = 2140692, upload-time = "2025-02-06T18:16:59.23Z" }, - { url = "https://files.pythonhosted.org/packages/f5/b6/653a70910739f127adffbefb688ebc22b51139292757de7c22b1e04ce792/xgboost-2.1.4-py3-none-macosx_12_0_arm64.whl", hash = "sha256:523db01d4e74b05c61a985028bde88a4dd380eadc97209310621996d7d5d14a7", size = 1939418, upload-time = "2025-02-06T18:17:02.494Z" }, - { url = "https://files.pythonhosted.org/packages/43/06/905fee34c10fb0d0c3baa15106413b76f360d8e958765ec57c9eddf762fa/xgboost-2.1.4-py3-none-manylinux2014_aarch64.whl", hash = "sha256:57c7e98111aceef4b689d7d2ce738564a1f7fe44237136837a47847b8b33bade", size = 4442052, upload-time = "2025-02-06T18:17:04.029Z" }, - { url = "https://files.pythonhosted.org/packages/f8/6a/41956f91ab984f2fa44529b2551d825a20d33807eba051a60d06ede2a87c/xgboost-2.1.4-py3-none-manylinux2014_x86_64.whl", hash = "sha256:f1343a512e634822eab30d300bfc00bf777dc869d881cc74854b42173cfcdb14", size = 4533170, upload-time = "2025-02-06T18:17:05.753Z" }, - { url = "https://files.pythonhosted.org/packages/b1/53/37032dca20dae7a88ad1907f817a81f232ca6e935f0c28c98db3c0a0bd22/xgboost-2.1.4-py3-none-manylinux_2_28_aarch64.whl", hash = "sha256:d366097d0db047315736f46af852feaa907f6d7371716af741cdce488ae36d20", size = 4206715, upload-time = "2025-02-06T18:17:08.448Z" }, - { url = "https://files.pythonhosted.org/packages/e4/3c/e3a93bfa7e8693c825df5ec02a40f7ff5f0950e02198b1e85da9315a8d47/xgboost-2.1.4-py3-none-manylinux_2_28_x86_64.whl", hash = "sha256:8df6da72963969ab2bf49a520c3e147b1e15cbeddd3aa0e3e039b3532c739339", size = 223642416, upload-time = "2025-02-06T18:17:25.08Z" }, - { url = 
"https://files.pythonhosted.org/packages/43/80/0b5a2dfcf5b4da27b0b68d2833f05d77e1a374d43db951fca200a1f12a52/xgboost-2.1.4-py3-none-win_amd64.whl", hash = "sha256:8bbfe4fedc151b83a52edbf0de945fd94358b09a81998f2945ad330fd5f20cd6", size = 124910381, upload-time = "2025-02-06T18:17:43.202Z" }, -] - [[package]] name = "xgboost" version = "3.2.0" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.14' and sys_platform == 'win32'", - "python_full_version >= '3.14' and sys_platform == 'emscripten'", - "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32'", - "python_full_version >= '3.12' and python_full_version < '3.14' and sys_platform == 'win32'", - "python_full_version >= '3.12' and python_full_version < '3.14' and sys_platform == 'emscripten'", - "python_full_version >= '3.12' and python_full_version < '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32'", - "python_full_version == '3.11.*' and sys_platform == 'win32'", - "python_full_version == '3.11.*' and sys_platform == 'emscripten'", - "python_full_version == '3.11.*' and sys_platform != 'emscripten' and sys_platform != 'win32'", - "python_full_version == '3.10.*'", -] dependencies = [ - { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.10.*'" }, - { name = "numpy", version = "2.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, - { name = "nvidia-nccl-cu12", marker = "python_full_version >= '3.10' and sys_platform == 'linux'" }, - { name = "scipy", version = "1.15.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.10.*'" }, - { name = "scipy", version = "1.17.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, + { name = "numpy" }, + { name = "nvidia-nccl-cu12", marker = "sys_platform == 'linux'" }, + { 
name = "scipy" }, ] sdist = { url = "https://files.pythonhosted.org/packages/91/bb/1eb0242409d22db725d7a88088e6cfd6556829fb0736f9ff69aa9f1e9455/xgboost-3.2.0.tar.gz", hash = "sha256:99b0e9a2a64896cdaf509c5e46372d336c692406646d20f2af505003c0c5d70d", size = 1263936, upload-time = "2026-02-10T11:03:05.542Z" } wheels = [ From 8350bd2432c4d0d816ceeca6d03c1a32b53638ac Mon Sep 17 00:00:00 2001 From: Fede Raimondo Date: Wed, 27 May 2026 15:17:19 +0300 Subject: [PATCH 06/24] Newer version of xgboost required (python compat) --- pyproject.toml | 5 ++--- uv.lock | 4 ++-- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 85f95339b..d96ff6071 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -81,8 +81,7 @@ optuna = [ "optuna_integration>=4.0.0,<5.0.0", ] -xgboost = ["xgboost>=2.1.4,<4.0.0"] - +xgboost = ["xgboost>=3.0.0,<4.0.0"] # Everything we need for docs is here docs = ["julearn[sphinx,viz,optuna,skopt]"] @@ -305,7 +304,7 @@ optional = [ "optuna>=4.0.0,<5.0.0", "optuna-integration>=4.0.0,<5.0.0", "scikit-optimize>=0.10.2,<0.11.0", - "xgboost>=2.1.4,<4.0.0", + "xgboost>=3.0.0,<4.0.0", ] viz = [ "bokeh>=3.4.3", diff --git a/uv.lock b/uv.lock index 77dc9e139..ef8d16558 100644 --- a/uv.lock +++ b/uv.lock @@ -726,7 +726,7 @@ requires-dist = [ { name = "towncrier", marker = "extra == 'sphinx'", specifier = ">=25.8.0" }, { name = "tox", marker = "extra == 'dev'" }, { name = "xgboost", specifier = ">=2.1.4" }, - { name = "xgboost", marker = "extra == 'xgboost'", specifier = ">=2.1.4,<4.0.0" }, + { name = "xgboost", marker = "extra == 'xgboost'", specifier = ">=3.0.0,<4.0.0" }, ] provides-extras = ["dev", "sphinx", "deslib", "viz", "skopt", "optuna", "xgboos", "docs", "all"] @@ -759,7 +759,7 @@ optional = [ { name = "optuna", specifier = ">=4.0.0,<5.0.0" }, { name = "optuna-integration", specifier = ">=4.0.0,<5.0.0" }, { name = "scikit-optimize", specifier = ">=0.10.2,<0.11.0" }, - { name = "xgboost", specifier = ">=2.1.4,<4.0.0" 
}, + { name = "xgboost", specifier = ">=3.0.0,<4.0.0" }, ] viz = [ { name = "bokeh", specifier = ">=3.4.3" }, From f19f70151d470bc9f1c5baeee730c171cfdd5e9c Mon Sep 17 00:00:00 2001 From: Fede Raimondo Date: Wed, 27 May 2026 15:38:06 +0300 Subject: [PATCH 07/24] Fix for multiclass --- .../models/tests/test_xgb_cvearlystopping.py | 24 +++++++++++++++++++ julearn/models/xgb_cvearlystopping.py | 17 +++++++------ 2 files changed, 32 insertions(+), 9 deletions(-) diff --git a/julearn/models/tests/test_xgb_cvearlystopping.py b/julearn/models/tests/test_xgb_cvearlystopping.py index 0cc25379e..a144cd7f5 100644 --- a/julearn/models/tests/test_xgb_cvearlystopping.py +++ b/julearn/models/tests/test_xgb_cvearlystopping.py @@ -116,3 +116,27 @@ def test_XGBClassifierCVEarlyStopping_grouped(df_iris) -> None: assert _is_fitted(model) assert hasattr(model, "_grouped_cv") assert model._grouped_cv is True + + +def test_XGBClassifierCVEarlyStopping_binary(df_binary) -> None: + """Test XGBClassifierCVEarlyStopping with binary classification. + + Parameters + ---------- + df_binary : pd.DataFrame + The binary classification dataset as a DataFrame. 
+ + """ + X = ["sepal_length", "sepal_width", "petal_width"] + y = "species" + + model = XGBClassifierCVEarlyStopping( + test_size=0.2, early_stopping_rounds=5, random_state=42 + ) + + assert _is_fitted(model) is False + assert not hasattr(model, "_grouped_cv") + model.fit(df_binary[X], df_binary[y]) + assert _is_fitted(model) + assert hasattr(model, "_grouped_cv") + assert model._grouped_cv is False diff --git a/julearn/models/xgb_cvearlystopping.py b/julearn/models/xgb_cvearlystopping.py index 267c75398..22a9ad6dc 100644 --- a/julearn/models/xgb_cvearlystopping.py +++ b/julearn/models/xgb_cvearlystopping.py @@ -124,15 +124,14 @@ def fit( # Create a model with the max iterations set as the best epochs and # refit on full data t_kwargs = self._xgboost_kwargs.copy() - if hasattr(model, "best_ntree_limit"): - t_kwargs["n_estimators"] = model.best_ntree_limit - else: - num_parallel_tree = model.get_params().get("num_parallel_tree") - if num_parallel_tree is None: - num_parallel_tree = 1 - t_kwargs["n_estimators"] = ( - model.best_iteration + 1 - ) * num_parallel_tree + + num_parallel_tree = model.get_params().get("num_parallel_tree") + if num_parallel_tree is None: + num_parallel_tree = 1 + n_classes = model.get_params().get("n_classes_", 1) + t_kwargs["n_estimators"] = ( + (model.best_iteration + 1) * num_parallel_tree * n_classes + ) model = self.base_estimator(**t_kwargs) model.fit(X=X, y=y) self._model = model From f346c073ebb374305e7c7b0876eea69f3e7b95da Mon Sep 17 00:00:00 2001 From: Fede Raimondo Date: Wed, 27 May 2026 15:41:40 +0300 Subject: [PATCH 08/24] More tests --- .../models/tests/test_xgb_cvearlystopping.py | 22 ++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/julearn/models/tests/test_xgb_cvearlystopping.py b/julearn/models/tests/test_xgb_cvearlystopping.py index a144cd7f5..ee0219f02 100644 --- a/julearn/models/tests/test_xgb_cvearlystopping.py +++ b/julearn/models/tests/test_xgb_cvearlystopping.py @@ -4,6 +4,7 @@ # 
License: AGPL import pandas as pd +import pytest from sklearn.utils.validation import _is_fitted from julearn.models.xgb_cvearlystopping import ( @@ -116,6 +117,9 @@ def test_XGBClassifierCVEarlyStopping_grouped(df_iris) -> None: assert _is_fitted(model) assert hasattr(model, "_grouped_cv") assert model._grouped_cv is True + assert model.get_params()["test_size"] == 0.2 + assert model.get_params()["early_stopping_rounds"] == 5 + assert model.get_params()["random_state"] == 42 def test_XGBClassifierCVEarlyStopping_binary(df_binary) -> None: @@ -131,7 +135,7 @@ def test_XGBClassifierCVEarlyStopping_binary(df_binary) -> None: y = "species" model = XGBClassifierCVEarlyStopping( - test_size=0.2, early_stopping_rounds=5, random_state=42 + test_size=0.2, early_stopping_rounds=5 ) assert _is_fitted(model) is False @@ -140,3 +144,19 @@ def test_XGBClassifierCVEarlyStopping_binary(df_binary) -> None: assert _is_fitted(model) assert hasattr(model, "_grouped_cv") assert model._grouped_cv is False + assert model.get_params()["test_size"] == 0.2 + assert model.get_params()["early_stopping_rounds"] == 5 + + +def test_XGBClassifierCVEarlyStopping_errors() -> None: + """Test XGBClassifierCVEarlyStopping error handling.""" + with pytest.raises(ValueError, match="early_stopping_rounds"): + model = XGBClassifierCVEarlyStopping( + test_size=0.2, early_stopping_rounds=None, random_state=42 + ) + + with pytest.raises(ValueError, match="not fitted"): + model = XGBClassifierCVEarlyStopping( + test_size=None, early_stopping_rounds=5, random_state=42 + ) + model.predict([[1, 2], [3, 4], [5, 6]]) From 1eb5cb521ed6fde21594cca0a75fabed135fa1a1 Mon Sep 17 00:00:00 2001 From: Fede Raimondo Date: Wed, 27 May 2026 19:06:31 +0300 Subject: [PATCH 09/24] XGB Early Stopping conditional to xgboost --- julearn/models/available_models.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/julearn/models/available_models.py b/julearn/models/available_models.py index 
97fdaf016..6ff070cda 100644 --- a/julearn/models/available_models.py +++ b/julearn/models/available_models.py @@ -50,6 +50,11 @@ try: # pragma: no cover from xgboost import XGBClassifier, XGBRegressor + from .xgb_cvearlystopping import ( + XGBClassifierCVEarlyStopping, + XGBRegressorCVEarlyStopping, + ) + _has_xgboost = True except ImportError: _has_xgboost = False @@ -58,10 +63,6 @@ from ..utils.logging import DelayedFmtMessage as __ from ..utils.typing import ModelLike from .dynamic import DynamicSelection -from .xgb_cvearlystopping import ( - XGBClassifierCVEarlyStopping, - XGBRegressorCVEarlyStopping, -) _available_models: dict[str, dict[str, Any]] = { From 75e2520b34ea28e1c348df9968d1a5a2966075d8 Mon Sep 17 00:00:00 2001 From: Fede Raimondo Date: Wed, 27 May 2026 19:20:34 +0300 Subject: [PATCH 10/24] Fix docs --- docs/available_pipeline_steps.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/available_pipeline_steps.rst b/docs/available_pipeline_steps.rst index 089d3a4ac..e6b0737ef 100644 --- a/docs/available_pipeline_steps.rst +++ b/docs/available_pipeline_steps.rst @@ -242,7 +242,7 @@ Ensemble - Y - Y - Y - * - ``xgb_cvearlystopping`` + * - ``xgb_cvearlystopping`` - XGBoost with Cross-Validation and Early Stopping - | :class:`~julearn.models.xgb_cvearlystopping.XGBClassifierCVEarlyStopping` and | :class:`~julearn.models.xgb_cvearlystopping.XGBRegressorCVEarlyStopping` From e34ab264495f717caedb239182710aebae9db28c Mon Sep 17 00:00:00 2001 From: Fede Raimondo Date: Wed, 27 May 2026 19:21:35 +0300 Subject: [PATCH 11/24] Add xgboost intersphinx --- docs/conf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/conf.py b/docs/conf.py index cab9cfe9c..4eb525573 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -231,6 +231,7 @@ None, ), "panel": ("https://panel.holoviz.org/", None), + "xgboost": ("https://xgboost.readthedocs.io/en/stable/", None), } # -- sphinx.ext.extlinks configuration --------------------------------------- From 
e59f6b7b4b0c637ccdd887e5f46fe641fc411c33 Mon Sep 17 00:00:00 2001 From: Fede Raimondo Date: Wed, 27 May 2026 21:41:50 +0300 Subject: [PATCH 12/24] Add documentation for custom models --- docs/api/models.rst | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/docs/api/models.rst b/docs/api/models.rst index 7fcb9c2e2..d65374cb6 100644 --- a/docs/api/models.rst +++ b/docs/api/models.rst @@ -19,6 +19,20 @@ Functions register_model reset_model_register +Julearn custom models +--------------------- + +This is a list of models implemented by Julearn that are not simple wrappers +around existing models in other libraries but rather variants of existing +models or novel models. + +.. autosummary:: + :toctree: generated/ + :template: class.rst + + xgb_cvearlystopping.XGBClassifierCVEarlyStopping + xgb_cvearlystopping.XGBRegressorCVEarlyStopping + Dynamic Selection (DESLib) ========================== From 3a006ab6a914d7373eef524d76d9d9f389a8e4b8 Mon Sep 17 00:00:00 2001 From: Fede Raimondo Date: Wed, 27 May 2026 21:43:37 +0300 Subject: [PATCH 13/24] Add doc for XGB CV Early stopping --- docs/api/models.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/api/models.rst b/docs/api/models.rst index d65374cb6..0556012e7 100644 --- a/docs/api/models.rst +++ b/docs/api/models.rst @@ -27,6 +27,7 @@ around existing models in other libraries but rather variants of existing models or novel models. .. 
autosummary:: + :nosignatures: :toctree: generated/ :template: class.rst From a395752638a7f406c795d620d57a8c75c4f88e75 Mon Sep 17 00:00:00 2001 From: Fede Raimondo Date: Wed, 27 May 2026 22:34:17 +0300 Subject: [PATCH 14/24] More tests --- .../models/tests/test_xgb_cvearlystopping.py | 204 ++++++++++++++++++ julearn/models/xgb_cvearlystopping.py | 5 +- 2 files changed, 207 insertions(+), 2 deletions(-) diff --git a/julearn/models/tests/test_xgb_cvearlystopping.py b/julearn/models/tests/test_xgb_cvearlystopping.py index ee0219f02..281290d5f 100644 --- a/julearn/models/tests/test_xgb_cvearlystopping.py +++ b/julearn/models/tests/test_xgb_cvearlystopping.py @@ -40,6 +40,21 @@ def test_XGBRegressorCVEarlyStopping_grouped(df_iris) -> None: assert _is_fitted(model) assert hasattr(model, "_grouped_cv") assert model._grouped_cv is True + assert model._model.get_params()["num_parallel_tree"] is None + + # Check that the model was refit with the best number of iterations + assert model._model.get_params()["early_stopping_rounds"] is None + assert model._model.get_params()["random_state"] == 42 + assert model._best_iteration is not None + assert ( + model._model.get_params()["n_estimators"] == model._best_iteration + 1 + ) + + y_pred = model.predict(df_iris[X]) + assert y_pred.shape == (len(df_iris),) + + score = model.score(df_iris[X], df_iris[y]) + assert isinstance(score, float) def test_XGBRegressorCVEarlyStopping_notgrouped(df_iris) -> None: @@ -64,6 +79,52 @@ def test_XGBRegressorCVEarlyStopping_notgrouped(df_iris) -> None: assert _is_fitted(model) assert hasattr(model, "_grouped_cv") assert model._grouped_cv is False + assert model._model.get_params()["num_parallel_tree"] is None + + # Check that the model was refit with the best number of iterations + assert model._model.get_params()["early_stopping_rounds"] is None + assert model._model.get_params()["random_state"] == 42 + assert model._best_iteration is not None + assert ( + model._model.get_params()["n_estimators"] 
== model._best_iteration + 1 + ) + + +def test_XGBRegressorCVEarlyStopping_numpy(df_iris) -> None: + """Test XGBRegressorCVEarlyStopping with numpy data. + + Parameters + ---------- + df_iris : pd.DataFrame + The iris dataset as a DataFrame. + + """ + X = ["sepal_length", "sepal_width", "petal_width"] + y = "petal_length" + + model = XGBRegressorCVEarlyStopping( + test_size=0.2, + early_stopping_rounds=5, + random_state=42, + num_parallel_tree=2, + ) + + assert _is_fitted(model) is False + assert not hasattr(model, "_grouped_cv") + model.fit(df_iris[X].values, df_iris[y].values) + assert _is_fitted(model) + assert hasattr(model, "_grouped_cv") + assert model._grouped_cv is False + + # Check that the model was refit with the best number of iterations + assert model._model.get_params()["early_stopping_rounds"] is None + assert model._model.get_params()["random_state"] == 42 + assert model._best_iteration is not None + assert model._model.get_params()["num_parallel_tree"] == 2 + assert ( + model._model.get_params()["n_estimators"] + == (model._best_iteration + 1) * 2 + ) def test_XGBClassifierCVEarlyStopping_notgrouped(df_iris) -> None: @@ -88,6 +149,18 @@ def test_XGBClassifierCVEarlyStopping_notgrouped(df_iris) -> None: assert _is_fitted(model) assert hasattr(model, "_grouped_cv") assert model._grouped_cv is False + assert model._model.get_params()["num_parallel_tree"] is None + + # Check that the model was refit with the best number of iterations + assert model._model.get_params()["early_stopping_rounds"] is None + assert model._model.get_params()["random_state"] == 42 + assert model._best_iteration is not None + + # Three classes, so the number of trees is the best iteration times 3 + assert ( + model._model.get_params()["n_estimators"] + == (model._best_iteration + 1) * 3 + ) def test_XGBClassifierCVEarlyStopping_grouped(df_iris) -> None: @@ -121,6 +194,28 @@ def test_XGBClassifierCVEarlyStopping_grouped(df_iris) -> None: assert 
model.get_params()["early_stopping_rounds"] == 5 assert model.get_params()["random_state"] == 42 + # Check that the model was refit with the best number of iterations + assert model._model.get_params()["early_stopping_rounds"] is None + assert model._model.get_params()["random_state"] == 42 + assert model._best_iteration is not None + + # Three classes, so the number of trees is the best iteration times 3 + assert ( + model._model.get_params()["n_estimators"] + == (model._best_iteration + 1) * 3 + ) + + y_pred = model.predict(df_iris[X]) + assert y_pred.shape == (len(df_iris),) + assert set(y_pred).issubset(set(df_iris[y])) + + y_probas = model.predict_proba(df_iris[X]) + assert y_probas.shape == (len(df_iris), 3) + assert (y_probas >= 0).all() and (y_probas <= 1).all() + + score = model.score(df_iris[X], df_iris[y]) + assert isinstance(score, float) + def test_XGBClassifierCVEarlyStopping_binary(df_binary) -> None: """Test XGBClassifierCVEarlyStopping with binary classification. @@ -146,6 +241,16 @@ def test_XGBClassifierCVEarlyStopping_binary(df_binary) -> None: assert model._grouped_cv is False assert model.get_params()["test_size"] == 0.2 assert model.get_params()["early_stopping_rounds"] == 5 + # Check that the model was refit with the best number of iterations + assert model._model.get_params()["early_stopping_rounds"] is None + assert model._model.get_params()["random_state"] is None + assert model._best_iteration is not None + + # Two classes, so the number of trees is the best iteration times 2 + assert ( + model._model.get_params()["n_estimators"] + == (model._best_iteration + 1) * 2 + ) def test_XGBClassifierCVEarlyStopping_errors() -> None: @@ -160,3 +265,102 @@ def test_XGBClassifierCVEarlyStopping_errors() -> None: test_size=None, early_stopping_rounds=5, random_state=42 ) model.predict([[1, 2], [3, 4], [5, 6]]) + + +def test_XGBClassifierCVEarlyStopping_numpy(df_iris) -> None: + """Test XGBClassifierCVEarlyStopping with numpy data. 
+ + Parameters + ---------- + df_iris : pd.DataFrame + The iris dataset as a DataFrame. + + """ + X = ["sepal_length", "sepal_width", "petal_width"] + y = "species" + + model = XGBClassifierCVEarlyStopping( + test_size=0.2, early_stopping_rounds=5, random_state=42 + ) + + assert _is_fitted(model) is False + assert not hasattr(model, "_grouped_cv") + model.fit(df_iris[X].values, (df_iris[y].values == "setosa").astype(int)) + assert _is_fitted(model) + assert hasattr(model, "_grouped_cv") + assert model._grouped_cv is False + + # Check that the model was refit with the best number of iterations + assert model._model.get_params()["early_stopping_rounds"] is None + assert model._model.get_params()["random_state"] == 42 + assert model._best_iteration is not None + + # Two classes, so the number of trees is the best iteration times 2 + assert ( + model._model.get_params()["n_estimators"] + == (model._best_iteration + 1) * 2 + ) + + +def test_XGBClassifierCVEarlyStopping_set_params(df_iris) -> None: + """Test XGBClassifierCVEarlyStopping with grouped data. + + Parameters + ---------- + df_iris : pd.DataFrame + The iris dataset as a DataFrame. 
+ + """ + X = ["sepal_length", "sepal_width", "petal_width"] + y = "species" + n_groups = 20 + bins = pd.cut( + df_iris.index.values, labels=list(range(n_groups)), bins=n_groups + ) + df_iris["group"] = bins.astype(int) + + model = XGBClassifierCVEarlyStopping( + test_size=0.2, early_stopping_rounds=5, random_state=42 + ) + + assert _is_fitted(model) is False + assert not hasattr(model, "_grouped_cv") + model.fit(df_iris[X], df_iris[y], groups=df_iris["group"]) + assert _is_fitted(model) + assert hasattr(model, "_grouped_cv") + assert model._grouped_cv is True + assert model.get_params()["test_size"] == 0.2 + assert model.get_params()["early_stopping_rounds"] == 5 + assert model.get_params()["random_state"] == 42 + + # Check that the model was refit with the best number of iterations + assert model._model.get_params()["early_stopping_rounds"] is None + assert model._model.get_params()["random_state"] == 42 + assert model._best_iteration is not None + + # Three classes, so the number of trees is the best iteration times 3 + assert ( + model._model.get_params()["n_estimators"] + == (model._best_iteration + 1) * 3 + ) + + model.set_params(test_size=0.3, early_stopping_rounds=10, random_state=24) + assert model.get_params()["test_size"] == 0.3 + assert model.get_params()["early_stopping_rounds"] == 10 + assert model.get_params()["random_state"] == 24 + model.fit(df_iris[X], df_iris[y], groups=df_iris["group"]) + assert _is_fitted(model) + assert hasattr(model, "_grouped_cv") + assert model._grouped_cv is True + assert model.get_params()["test_size"] == 0.3 + assert model.get_params()["early_stopping_rounds"] == 10 + assert model.get_params()["random_state"] == 24 + # Check that the model was refit with the best number of iterations + assert model._model.get_params()["early_stopping_rounds"] is None + assert model._model.get_params()["random_state"] == 24 + assert model._best_iteration is not None + # Three classes, so the number of trees is the best iteration times 3 + 
assert ( + model._model.get_params()["n_estimators"] + == (model._best_iteration + 1) * 3 + ) diff --git a/julearn/models/xgb_cvearlystopping.py b/julearn/models/xgb_cvearlystopping.py index 22a9ad6dc..6aca8f402 100644 --- a/julearn/models/xgb_cvearlystopping.py +++ b/julearn/models/xgb_cvearlystopping.py @@ -124,13 +124,14 @@ def fit( # Create a model with the max iterations set as the best epochs and # refit on full data t_kwargs = self._xgboost_kwargs.copy() + self._best_iteration = model.best_iteration num_parallel_tree = model.get_params().get("num_parallel_tree") if num_parallel_tree is None: num_parallel_tree = 1 - n_classes = model.get_params().get("n_classes_", 1) + n_classes = getattr(model, "n_classes_", 1) t_kwargs["n_estimators"] = ( - (model.best_iteration + 1) * num_parallel_tree * n_classes + (self._best_iteration + 1) * num_parallel_tree * n_classes ) model = self.base_estimator(**t_kwargs) model.fit(X=X, y=y) From 6ea304393059649020c4de0c2534e667edaa5e05 Mon Sep 17 00:00:00 2001 From: Fede Raimondo Date: Wed, 27 May 2026 23:10:35 +0300 Subject: [PATCH 15/24] More tests --- .../models/tests/test_xgb_cvearlystopping.py | 54 +++++++++++++++++-- julearn/models/xgb_cvearlystopping.py | 27 ++-------- 2 files changed, 52 insertions(+), 29 deletions(-) diff --git a/julearn/models/tests/test_xgb_cvearlystopping.py b/julearn/models/tests/test_xgb_cvearlystopping.py index 281290d5f..9e4382f0f 100644 --- a/julearn/models/tests/test_xgb_cvearlystopping.py +++ b/julearn/models/tests/test_xgb_cvearlystopping.py @@ -251,6 +251,16 @@ def test_XGBClassifierCVEarlyStopping_binary(df_binary) -> None: model._model.get_params()["n_estimators"] == (model._best_iteration + 1) * 2 ) + y_pred = model.predict(df_binary[X]) + assert y_pred.shape == (len(df_binary),) + assert set(y_pred).issubset(set(df_binary[y])) + + y_probas = model.predict_proba(df_binary[X]) + assert y_probas.shape == (len(df_binary), 2) + assert (y_probas >= 0).all() and (y_probas <= 1).all() + + score 
= model.score(df_binary[X], df_binary[y]) + assert isinstance(score, float) def test_XGBClassifierCVEarlyStopping_errors() -> None: @@ -285,7 +295,7 @@ def test_XGBClassifierCVEarlyStopping_numpy(df_iris) -> None: assert _is_fitted(model) is False assert not hasattr(model, "_grouped_cv") - model.fit(df_iris[X].values, (df_iris[y].values == "setosa").astype(int)) + model.fit(df_iris[X].values, df_iris[y].values) assert _is_fitted(model) assert hasattr(model, "_grouped_cv") assert model._grouped_cv is False @@ -295,12 +305,39 @@ def test_XGBClassifierCVEarlyStopping_numpy(df_iris) -> None: assert model._model.get_params()["random_state"] == 42 assert model._best_iteration is not None - # Two classes, so the number of trees is the best iteration times 2 + # Three classes, so the number of trees is the best iteration times 3 assert ( model._model.get_params()["n_estimators"] - == (model._best_iteration + 1) * 2 + == (model._best_iteration + 1) * 3 + ) + + model.fit(df_iris[X].values, df_iris[y].values.to_numpy()) + assert _is_fitted(model) + assert hasattr(model, "_grouped_cv") + assert model._grouped_cv is False + + # Check that the model was refit with the best number of iterations + assert model._model.get_params()["early_stopping_rounds"] is None + assert model._model.get_params()["random_state"] == 42 + assert model._best_iteration is not None + + # Three classes, so the number of trees is the best iteration times 3 + assert ( + model._model.get_params()["n_estimators"] + == (model._best_iteration + 1) * 3 ) + y_pred = model.predict(df_iris[X]) + assert y_pred.shape == (len(df_iris),) + assert set(y_pred).issubset(set(df_iris[y])) + + y_probas = model.predict_proba(df_iris[X]) + assert y_probas.shape == (len(df_iris), 3) + assert (y_probas >= 0).all() and (y_probas <= 1).all() + + score = model.score(df_iris[X], df_iris[y]) + assert isinstance(score, float) + def test_XGBClassifierCVEarlyStopping_set_params(df_iris) -> None: """Test XGBClassifierCVEarlyStopping 
with grouped data. @@ -344,10 +381,16 @@ def test_XGBClassifierCVEarlyStopping_set_params(df_iris) -> None: == (model._best_iteration + 1) * 3 ) - model.set_params(test_size=0.3, early_stopping_rounds=10, random_state=24) + model.set_params( + test_size=0.3, + early_stopping_rounds=10, + random_state=24, + num_parallel_tree=2, + ) assert model.get_params()["test_size"] == 0.3 assert model.get_params()["early_stopping_rounds"] == 10 assert model.get_params()["random_state"] == 24 + assert model.get_params()["num_parallel_tree"] == 2 model.fit(df_iris[X], df_iris[y], groups=df_iris["group"]) assert _is_fitted(model) assert hasattr(model, "_grouped_cv") @@ -355,6 +398,7 @@ def test_XGBClassifierCVEarlyStopping_set_params(df_iris) -> None: assert model.get_params()["test_size"] == 0.3 assert model.get_params()["early_stopping_rounds"] == 10 assert model.get_params()["random_state"] == 24 + assert model.get_params()["num_parallel_tree"] == 2 # Check that the model was refit with the best number of iterations assert model._model.get_params()["early_stopping_rounds"] is None assert model._model.get_params()["random_state"] == 24 @@ -362,5 +406,5 @@ def test_XGBClassifierCVEarlyStopping_set_params(df_iris) -> None: # Three classes, so the number of trees is the best iteration times 3 assert ( model._model.get_params()["n_estimators"] - == (model._best_iteration + 1) * 3 + == (model._best_iteration + 1) * 3 * 2 ) diff --git a/julearn/models/xgb_cvearlystopping.py b/julearn/models/xgb_cvearlystopping.py index 6aca8f402..b819bd23c 100644 --- a/julearn/models/xgb_cvearlystopping.py +++ b/julearn/models/xgb_cvearlystopping.py @@ -311,10 +311,9 @@ def fit( self._label_encoder = None # Check if labels are strings and convert to integers if so, to avoid # issues with XGBoost - if isinstance(y, pd.Series) and y.dtype in ["object", "string"]: - self._label_encoder = LabelEncoder() - y = self._label_encoder.fit_transform(y) # type: ignore - elif isinstance(y, np.ndarray) and y.dtype 
== "object": + if isinstance( + y, pd.Series | np.ndarray | pd.arrays.StringArray + ) and y.dtype in ["object", "string"]: self._label_encoder = LabelEncoder() y = self._label_encoder.fit_transform(y) # type: ignore super().fit(X, y, groups) @@ -357,23 +356,3 @@ def predict_proba(self, X: DataLike) -> DataLike: # noqa: N803 if self._model is None: raise ValueError("Model not fitted") return self._model.predict_proba(X) - - def score(self, X: DataLike, y: DataLike) -> float: # noqa: N803 - """Score the model. - - Parameters - ---------- - X : pd.DataFrame - The data to predict on. - y : DataLike - The true target values. - - Returns - ------- - float - The score. - - """ - if self._label_encoder is not None: - y = self._label_encoder.transform(y) - return super().score(X, y) From 3049db876123cd22e6dce6c30aa1f6d62a43286f Mon Sep 17 00:00:00 2001 From: Fede Raimondo Date: Wed, 27 May 2026 23:18:10 +0300 Subject: [PATCH 16/24] Consistency with numpy tests --- julearn/models/tests/test_xgb_cvearlystopping.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/julearn/models/tests/test_xgb_cvearlystopping.py b/julearn/models/tests/test_xgb_cvearlystopping.py index 9e4382f0f..f9e550a28 100644 --- a/julearn/models/tests/test_xgb_cvearlystopping.py +++ b/julearn/models/tests/test_xgb_cvearlystopping.py @@ -327,15 +327,15 @@ def test_XGBClassifierCVEarlyStopping_numpy(df_iris) -> None: == (model._best_iteration + 1) * 3 ) - y_pred = model.predict(df_iris[X]) + y_pred = model.predict(df_iris[X].values) assert y_pred.shape == (len(df_iris),) assert set(y_pred).issubset(set(df_iris[y])) - y_probas = model.predict_proba(df_iris[X]) + y_probas = model.predict_proba(df_iris[X].values) assert y_probas.shape == (len(df_iris), 3) assert (y_probas >= 0).all() and (y_probas <= 1).all() - score = model.score(df_iris[X], df_iris[y]) + score = model.score(df_iris[X].values, df_iris[y].values) assert isinstance(score, float) From 87c05466fc4c3cf7cff9f387c625b42951c411e1 
Mon Sep 17 00:00:00 2001 From: Fede Raimondo Date: Thu, 28 May 2026 09:01:38 +0300 Subject: [PATCH 17/24] One extra case --- .../models/tests/test_xgb_cvearlystopping.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/julearn/models/tests/test_xgb_cvearlystopping.py b/julearn/models/tests/test_xgb_cvearlystopping.py index f9e550a28..5473c1053 100644 --- a/julearn/models/tests/test_xgb_cvearlystopping.py +++ b/julearn/models/tests/test_xgb_cvearlystopping.py @@ -276,6 +276,12 @@ def test_XGBClassifierCVEarlyStopping_errors() -> None: ) model.predict([[1, 2], [3, 4], [5, 6]]) + with pytest.raises(ValueError, match="not fitted"): + model = XGBClassifierCVEarlyStopping( + test_size=None, early_stopping_rounds=5, random_state=42 + ) + model.predict_proba([[1, 2], [3, 4], [5, 6]]) + def test_XGBClassifierCVEarlyStopping_numpy(df_iris) -> None: """Test XGBClassifierCVEarlyStopping with numpy data. @@ -311,7 +317,9 @@ def test_XGBClassifierCVEarlyStopping_numpy(df_iris) -> None: == (model._best_iteration + 1) * 3 ) - model.fit(df_iris[X].values, df_iris[y].values.to_numpy()) + y_nostring = df_iris[y].values.to_numpy() == "setosa" + + model.fit(df_iris[X].values, y_nostring) assert _is_fitted(model) assert hasattr(model, "_grouped_cv") assert model._grouped_cv is False @@ -324,18 +332,18 @@ def test_XGBClassifierCVEarlyStopping_numpy(df_iris) -> None: # Three classes, so the number of trees is the best iteration times 3 assert ( model._model.get_params()["n_estimators"] - == (model._best_iteration + 1) * 3 + == (model._best_iteration + 1) * 2 ) y_pred = model.predict(df_iris[X].values) assert y_pred.shape == (len(df_iris),) - assert set(y_pred).issubset(set(df_iris[y])) + assert set(y_pred).issubset(set(y_nostring)) y_probas = model.predict_proba(df_iris[X].values) - assert y_probas.shape == (len(df_iris), 3) + assert y_probas.shape == (len(df_iris), 2) assert (y_probas >= 0).all() and (y_probas <= 1).all() - score = 
model.score(df_iris[X].values, df_iris[y].values) + score = model.score(df_iris[X].values, y_nostring) assert isinstance(score, float) From cac4f1b18a416b62a7aef60dad3b14a371bb46d0 Mon Sep 17 00:00:00 2001 From: Fede Raimondo Date: Sat, 30 May 2026 11:27:28 +0300 Subject: [PATCH 18/24] Sync uv --- uv.lock | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/uv.lock b/uv.lock index ef8d16558..755e3fbb0 100644 --- a/uv.lock +++ b/uv.lock @@ -728,7 +728,7 @@ requires-dist = [ { name = "xgboost", specifier = ">=2.1.4" }, { name = "xgboost", marker = "extra == 'xgboost'", specifier = ">=3.0.0,<4.0.0" }, ] -provides-extras = ["dev", "sphinx", "deslib", "viz", "skopt", "optuna", "xgboos", "docs", "all"] +provides-extras = ["dev", "sphinx", "deslib", "viz", "skopt", "optuna", "xgboost", "docs", "all"] [package.metadata.requires-dev] dev = [ @@ -1982,10 +1982,7 @@ wheels = [ name = "sphinx-polyversion" version = "2.0.0" source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "sphinx" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/9a/10/25231164a97a9016bdc73a3530af8f4a6846bdc564af1460af2ff3e59a50/sphinx-multiversion-0.2.4.tar.gz", hash = "sha256:5cd1ca9ecb5eed63cb8d6ce5e9c438ca13af4fa98e7eb6f376be541dd4990bcb", size = 7024, upload-time = "2020-08-12T15:48:20.566Z" } +sdist = { url = "https://files.pythonhosted.org/packages/75/01/7755fc4b01ff281df937f6563c190b55313a14e9b1b5ac960003933f0793/sphinx_polyversion-2.0.0.tar.gz", hash = "sha256:ce5d15bbf5d2003aaec9e25c1646ec4f8a91cd55dac89df60ff17bd15630f926", size = 32265, upload-time = "2025-09-25T15:15:13.035Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/13/ac/88c719ec04351cd1d56c5faab18e6183b3d82eb5c64a10284feeaadb6ff1/sphinx_polyversion-2.0.0-py3-none-any.whl", hash = "sha256:5455e8a5560d587a0401009196612c98b067138e69556891eb9b8edc547e1327", size = 36628, upload-time = "2025-09-25T15:15:11.939Z" }, ] From c8857fbb511e66c88c055534f17a4259080a25e8 Mon 
Sep 17 00:00:00 2001 From: Fede Raimondo Date: Sat, 30 May 2026 12:07:17 +0300 Subject: [PATCH 19/24] Add tests for XGB CV numpy grouped --- .../models/tests/test_xgb_cvearlystopping.py | 58 +++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/julearn/models/tests/test_xgb_cvearlystopping.py b/julearn/models/tests/test_xgb_cvearlystopping.py index 5473c1053..1119bf38c 100644 --- a/julearn/models/tests/test_xgb_cvearlystopping.py +++ b/julearn/models/tests/test_xgb_cvearlystopping.py @@ -263,6 +263,64 @@ def test_XGBClassifierCVEarlyStopping_binary(df_binary) -> None: assert isinstance(score, float) +def test_XGBClassifierCVEarlyStopping_grouped_numpy(df_iris) -> None: + """Test XGBClassifierCVEarlyStopping with grouped data and numpy arrays. + + Parameters + ---------- + df_iris : pd.DataFrame + The iris dataset as a DataFrame. + + """ + X = ["sepal_length", "sepal_width", "petal_width"] + y = "species" + n_groups = 20 + bins = pd.cut( + df_iris.index.values, labels=list(range(n_groups)), bins=n_groups + ) + df_iris["group"] = bins.astype(int) + + model = XGBClassifierCVEarlyStopping( + test_size=0.2, early_stopping_rounds=5, random_state=42 + ) + + assert _is_fitted(model) is False + assert not hasattr(model, "_grouped_cv") + model.fit( + df_iris[X].values, + df_iris[y].values.to_numpy(), + groups=df_iris["group"].values, + ) + assert _is_fitted(model) + assert hasattr(model, "_grouped_cv") + assert model._grouped_cv is True + assert model.get_params()["test_size"] == 0.2 + assert model.get_params()["early_stopping_rounds"] == 5 + assert model.get_params()["random_state"] == 42 + + # Check that the model was refit with the best number of iterations + assert model._model.get_params()["early_stopping_rounds"] is None + assert model._model.get_params()["random_state"] == 42 + assert model._best_iteration is not None + + # Three classes, so the number of trees is the best iteration times 3 + assert ( + model._model.get_params()["n_estimators"] + == 
(model._best_iteration + 1) * 3 + ) + + y_pred = model.predict(df_iris[X]) + assert y_pred.shape == (len(df_iris),) + assert set(y_pred).issubset(set(df_iris[y])) + + y_probas = model.predict_proba(df_iris[X]) + assert y_probas.shape == (len(df_iris), 3) + assert (y_probas >= 0).all() and (y_probas <= 1).all() + + score = model.score(df_iris[X], df_iris[y]) + assert isinstance(score, float) + + def test_XGBClassifierCVEarlyStopping_errors() -> None: """Test XGBClassifierCVEarlyStopping error handling.""" with pytest.raises(ValueError, match="early_stopping_rounds"): From a58f78a22a4be0f0f03ce2c95b13038c8e9a95e4 Mon Sep 17 00:00:00 2001 From: Fede Raimondo Date: Sat, 30 May 2026 22:51:02 +0300 Subject: [PATCH 20/24] Remove xgboost as core-dependency --- pyproject.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index d96ff6071..37510ba76 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,7 +38,6 @@ dependencies = [ "statsmodels>=0.13,<0.15", "scikit-learn>=1.5.0,<1.9.0", "looseversion==1.3.0; python_version>='3.12'", - "xgboost>=2.1.4", ] dynamic = ["version"] From 3536cc55ea067a3b91e18d0b9f2d5f2aa257a469 Mon Sep 17 00:00:00 2001 From: Fede Raimondo Date: Sat, 30 May 2026 22:53:43 +0300 Subject: [PATCH 21/24] Fix uv.lock --- uv.lock | 2 -- 1 file changed, 2 deletions(-) diff --git a/uv.lock b/uv.lock index 755e3fbb0..610ed139e 100644 --- a/uv.lock +++ b/uv.lock @@ -587,7 +587,6 @@ dependencies = [ { name = "pandas" }, { name = "scikit-learn" }, { name = "statsmodels" }, - { name = "xgboost" }, ] [package.optional-dependencies] @@ -725,7 +724,6 @@ requires-dist = [ { name = "towncrier", marker = "extra == 'dev'", specifier = ">=25.8.0" }, { name = "towncrier", marker = "extra == 'sphinx'", specifier = ">=25.8.0" }, { name = "tox", marker = "extra == 'dev'" }, - { name = "xgboost", specifier = ">=2.1.4" }, { name = "xgboost", marker = "extra == 'xgboost'", specifier = ">=3.0.0,<4.0.0" }, ] provides-extras = ["dev", "sphinx", 
"deslib", "viz", "skopt", "optuna", "xgboost", "docs", "all"] From 1d84f7b49182bd129f49c325993d4247e2302a04 Mon Sep 17 00:00:00 2001 From: Fede Raimondo Date: Sat, 30 May 2026 23:07:36 +0300 Subject: [PATCH 22/24] Add xgboost to build docs --- pyproject.toml | 3 ++- uv.lock | 5 ++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 37510ba76..a4ecf7f82 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -83,7 +83,7 @@ optuna = [ xgboost = ["xgboost>=3.0.0,<4.0.0"] # Everything we need for docs is here -docs = ["julearn[sphinx,viz,optuna,skopt]"] +docs = ["julearn[sphinx,viz,optuna,skopt,xgboost]"] # Add all optional functional dependencies (skip deslib until its fixed) # This does not include dev/docs building dependencies @@ -298,6 +298,7 @@ docs = [ "sphinx-polyversion>=2.0.0", "sphinxcontrib-towncrier>=0.5.0a0", "towncrier>=25.8.0", + "xgboost>=3.0.0,<4.0.0", ] optional = [ "optuna>=4.0.0,<5.0.0", diff --git a/uv.lock b/uv.lock index 610ed139e..0fe4cea1d 100644 --- a/uv.lock +++ b/uv.lock @@ -626,6 +626,7 @@ docs = [ { name = "sphinx-polyversion" }, { name = "sphinxcontrib-towncrier" }, { name = "towncrier" }, + { name = "xgboost" }, ] optuna = [ { name = "optuna" }, @@ -680,6 +681,7 @@ docs = [ { name = "sphinx-polyversion" }, { name = "sphinxcontrib-towncrier" }, { name = "towncrier" }, + { name = "xgboost" }, ] optional = [ { name = "optuna" }, @@ -698,7 +700,7 @@ requires-dist = [ { name = "bokeh", marker = "extra == 'viz'", specifier = ">=3.0.0" }, { name = "deslib", marker = "extra == 'deslib'", specifier = ">=0.3.5,<0.4.0" }, { name = "furo", marker = "extra == 'sphinx'", specifier = ">=2025.7.19" }, - { name = "julearn", extras = ["optuna", "skopt", "sphinx", "viz"], marker = "extra == 'docs'" }, + { name = "julearn", extras = ["optuna", "skopt", "sphinx", "viz", "xgboost"], marker = "extra == 'docs'" }, { name = "julearn", extras = ["optuna", "skopt", "viz", "xgboost"], marker = "extra == 'all'" }, { 
name = "looseversion", marker = "python_full_version >= '3.12'", specifier = "==1.3.0" }, { name = "numpy", specifier = ">=2.3.0,<3.0.0" }, @@ -752,6 +754,7 @@ docs = [ { name = "sphinx-polyversion", specifier = ">=2.0.0" }, { name = "sphinxcontrib-towncrier", specifier = ">=0.5.0a0" }, { name = "towncrier", specifier = ">=25.8.0" }, + { name = "xgboost", specifier = ">=3.0.0,<4.0.0" }, ] optional = [ { name = "optuna", specifier = ">=4.0.0,<5.0.0" }, From 32abddf92f789404c2e8651f9943151f0ab73161 Mon Sep 17 00:00:00 2001 From: Fede Raimondo Date: Tue, 2 Jun 2026 10:47:19 +0300 Subject: [PATCH 23/24] Add xgboost dep to tox --- tox.ini | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tox.ini b/tox.ini index 07a984197..1d5e78319 100644 --- a/tox.ini +++ b/tox.ini @@ -26,6 +26,7 @@ deps = scikit-optimize>=0.10.2,<0.11 optuna>=4.0.0,<5.0.0 optuna-integration>=4.0.0,<5.0.0 + xgboost>=3.0.0,<4.0.0 commands = pytest {toxinidir}/julearn @@ -62,6 +63,7 @@ deps = scikit-optimize>=0.10.2,<0.11 optuna>=4.0.0,<5.0.0 optuna-integration>=4.0.0,<5.0.0 + xgboost>=3.0.0,<4.0.0 commands = pytest -vv {toxinidir}/julearn @@ -88,6 +90,7 @@ deps = scikit-optimize>=0.10.2,<0.11 optuna>=4.0.0,<5.0.0 optuna-integration>=4.0.0,<5.0.0 + xgboost>=3.0.0,<4.0.0 commands = pytest --cov={envsitepackagesdir}/julearn --cov=./julearn --cov-report=xml --cov-report=term --cov-config=pyproject.toml -vv From 5639bcc646023f85a067406251f2b996317eee5b Mon Sep 17 00:00:00 2001 From: Fede Raimondo Date: Tue, 2 Jun 2026 11:09:21 +0300 Subject: [PATCH 24/24] Fix docstrings for XGB CV Early stopping --- julearn/models/xgb_cvearlystopping.py | 46 ++++++++++++++++++++------- 1 file changed, 35 insertions(+), 11 deletions(-) diff --git a/julearn/models/xgb_cvearlystopping.py b/julearn/models/xgb_cvearlystopping.py index b819bd23c..7146d0a73 100644 --- a/julearn/models/xgb_cvearlystopping.py +++ b/julearn/models/xgb_cvearlystopping.py @@ -35,9 +35,17 @@ class _BaseXGBCVEarlyStopping(BaseEstimator): base_estimator 
: class The base XGBoost estimator class to use (e.g. XGBRegressor or XGBClassifier). - test_size : float + test_size : float or int or None The proportion of the data to use as the validation set for early - stopping. + stopping. If groups is used on `fit`, this parameter refers to the + number of groups, otherwise it refers to the number of samples. If + float, should be between 0.0 and 1.0 and represent the + proportion of the dataset to include in the test split. If int, + represents the absolute number. If None, the value is + set to the complement of the train size. If train_size is also None, + it will be set to 0.25 in the case of non-grouped data or 0.2 for + grouped data (scikit-learn's defaults for `train_test_split` and + `GroupShuffleSplit`). early_stopping_rounds : int The number of rounds to use for early stopping. **kwargs : dict @@ -48,7 +56,7 @@ class _BaseXGBCVEarlyStopping(BaseEstimator): def __init__( self, base_estimator: XGBRegressor | XGBClassifier, - test_size: float | None, + test_size: float | int | None, early_stopping_rounds: int, **kwargs, ): @@ -81,7 +89,7 @@ def fit( The data to fit the model on. y : DataLike The target data. - groups : DataLike, optional + groups : DataLike or None The group labels for the samples used while splitting the dataset into train/test set for early stopping. If None, standard train/test split is used, by default None. @@ -226,9 +234,17 @@ class XGBRegressorCVEarlyStopping(_BaseXGBCVEarlyStopping, RegressorMixin): Parameters ---------- - test_size : float + test_size : float or int or None The proportion of the data to use as the validation set for early - stopping. + stopping. If groups is used on `fit`, this parameter refers to the + number of groups, otherwise it refers to the number of samples. If + float, should be between 0.0 and 1.0 and represent the + proportion of the dataset to include in the test split. If int, + represents the absolute number. 
If None, the value is + set to the complement of the train size. If train_size is also None, + it will be set to 0.25 in the case of non-grouped data or 0.2 for + grouped data (scikit-learn's defaults for `train_test_split` and + `GroupShuffleSplit`). early_stopping_rounds : int The number of rounds to use for early stopping. **kwargs : dict @@ -238,7 +254,7 @@ class XGBRegressorCVEarlyStopping(_BaseXGBCVEarlyStopping, RegressorMixin): def __init__( self, - test_size: float | None, + test_size: float | int | None, early_stopping_rounds: int, **kwargs: Any, ): @@ -260,9 +276,17 @@ class XGBClassifierCVEarlyStopping(_BaseXGBCVEarlyStopping, ClassifierMixin): Parameters ---------- - test_size : float + test_size : float or int or None The proportion of the data to use as the validation set for early - stopping. + stopping. If groups is used on `fit`, this parameter refers to the + number of groups, otherwise it refers to the number of samples. If + float, should be between 0.0 and 1.0 and represent the + proportion of the dataset to include in the test split. If int, + represents the absolute number. If None, the value is + set to the complement of the train size. If train_size is also None, + it will be set to 0.25 in the case of non-grouped data or 0.2 for + grouped data (scikit-learn's defaults for `train_test_split` and + `GroupShuffleSplit`). early_stopping_rounds : int The number of rounds to use for early stopping. **kwargs : dict @@ -272,7 +296,7 @@ class XGBClassifierCVEarlyStopping(_BaseXGBCVEarlyStopping, ClassifierMixin): def __init__( self, - test_size: float | None, + test_size: float | int | None, early_stopping_rounds: int, **kwargs: Any, ): @@ -297,7 +321,7 @@ def fit( The data to fit the model on. y : DataLike The target data. - groups : DataLike, optional + groups : DataLike or None The group labels for the samples used while splitting the dataset into train/test set for early stopping. If None, standard train/test split is used, by default None.