diff --git a/causalml/inference/meta/base.py b/causalml/inference/meta/base.py
index badb670b..9a948c7b 100644
--- a/causalml/inference/meta/base.py
+++ b/causalml/inference/meta/base.py
@@ -1,10 +1,9 @@
 from abc import ABCMeta, abstractmethod
-import copy
 import logging
 import numpy as np
 import pandas as pd
 from joblib import Parallel, delayed
-from sklearn.base import clone
+from sklearn.base import BaseEstimator, clone
 from tqdm import tqdm
 
 from causalml.inference.meta.explainer import Explainer
@@ -18,7 +17,9 @@ def _fit_bootstrap_clone(learner_template, X, treatment, y, p, seed, bootstrap_s
     """Module-level bootstrap helper for joblib pickling compatibility.
 
     Args:
-        learner_template: an unfitted template to clone
+        learner_template: an *unfitted* learner to clone as a template.
+            Because BaseLearner now inherits BaseEstimator, ``clone(learner_template)``
+            produces a clean unfitted copy via ``get_params``/``set_params``.
         X: feature matrix
         treatment: treatment vector
         y: outcome vector
@@ -34,12 +35,31 @@ def _fit_bootstrap_clone(learner_template, X, treatment, y, p, seed, bootstrap_s
     treatment_b = treatment[idxs]
     y_b = y[idxs]
     p_b = {group: _p[idxs] for group, _p in p.items()} if p is not None else None
-    learner_b = clone(learner_template, safe=False)
+    learner_b = clone(learner_template)  # safe=True works now via get_params/set_params
     learner_b.fit(X=X_b, treatment=treatment_b, y=y_b, p=p_b)
     return learner_b
 
 
-class BaseLearner(metaclass=ABCMeta):
+class BaseLearner(BaseEstimator, metaclass=ABCMeta):
+    """Base class for all causalml meta-learners.
+
+    Inheriting ``sklearn.base.BaseEstimator`` gives every subclass:
+    * ``get_params`` / ``set_params`` for free (requires verbatim ``__init__``
+      argument storage — see scikit-learn conventions).
+    * ``sklearn.base.clone`` support without ``safe=False``.
+    * ``Pipeline`` / ``GridSearchCV`` compatibility.
+
+    Subclass contract
+    -----------------
+    * ``__init__`` **must** store every argument verbatim as ``self.<param> = param``.
+      No logic, no ``deepcopy``, no derived attributes.
+    * All model construction and validation moves to ``fit()``.
+    * ``fit()`` deepcopies the verbatim-stored arg before fitting, so ``self.learner``
+      (and related params) remain unfitted across repeated ``fit()`` calls — this is
+      the warm-start invariant that replaces the old ``_model_*_template`` mechanism.
+    * ``__repr__`` is inherited from ``BaseEstimator`` and reflects constructor params.
+    """
+
     @classmethod
     @abstractmethod
     def fit(self, X, treatment, y, p=None):
@@ -99,11 +119,6 @@ def bootstrap(self, X, treatment, y, p=None, size=10000, rng=None):
         self.fit(X=X_b, treatment=treatment_b, y=y_b, p=p_b)
         return self.predict(X=X, p=p)
 
-    def _unfitted_clone(self):
-        """Return an unfitted copy for bootstrap refitting. Subclasses that hold fitted
-        sub-models should override to reset them to their unfitted templates."""
-        return clone(self, safe=False)
-
     def fit_bootstrap_ensemble(
         self,
         X,
@@ -121,12 +136,11 @@ def fit_bootstrap_ensemble(
         and stores them in self.bootstrap_models_. Used by predict(return_ci=True)
         to compute percentile-based confidence intervals on new data without refitting.
 
-        This design follows EconML's BootstrapEstimator pattern — each bootstrap
-        clone is a full copy of the learner, making this method generic across all
-        meta-learners.
-
-        Note: storing N bootstrap clones can be memory-intensive for heavy base
-        learners. Monitor RAM for large n_bootstraps.
+        ``clone(self)`` rebuilds the learner from constructor params (``get_params``/
+        ``set_params``); each ``fit()`` deepcopies those params before fitting, so
+        ``self.learner`` stays unfitted across calls.
+        Note: storing N fitted bootstrap models can be memory-intensive for heavy
+        base learners. Monitor RAM for large n_bootstraps.
 
         Args:
             X (np.matrix or np.array or pd.Dataframe): a feature matrix
@@ -138,15 +152,16 @@ def fit_bootstrap_ensemble(
             random_state (int, optional): random seed for reproducibility.
             n_jobs (int, optional): number of parallel jobs. -1 uses all cores. Default: 1.
         """
+        # Unfitted template rebuilt from constructor params; each job re-clones it.
+        unfitted_template = clone(self)
 
         rng = np.random.RandomState(random_state)
         seeds = rng.randint(0, np.iinfo(np.int32).max, size=n_bootstraps)
         logger.info("Storing bootstrap ensemble ({} iterations)".format(n_bootstraps))
 
-        learner_template = self._unfitted_clone()
         self.bootstrap_models_ = Parallel(n_jobs=n_jobs)(
             delayed(_fit_bootstrap_clone)(
-                learner_template, X, treatment, y, p, s, bootstrap_size
+                unfitted_template, X, treatment, y, p, s, bootstrap_size
             )
             for s in tqdm(seeds)
         )
diff --git a/causalml/inference/meta/drlearner.py b/causalml/inference/meta/drlearner.py
index a5b051dc..b1d04451 100644
--- a/causalml/inference/meta/drlearner.py
+++ b/causalml/inference/meta/drlearner.py
@@ -46,44 +46,22 @@ def __init__(
             treatment_effect_learner (optional): a model to estimate treatment effects in the treatment group
             ate_alpha (float, optional): the confidence level alpha of the ATE estimate
             control_name (str or int, optional): name of control group
-        """
-        assert (learner is not None) or (
-            (control_outcome_learner is not None)
-            and (treatment_outcome_learner is not None)
-            and (treatment_effect_learner is not None)
-        )
-
-        if control_outcome_learner is None:
-            self.model_mu_c = deepcopy(learner)
-        else:
-            self.model_mu_c = control_outcome_learner
-
-        if treatment_outcome_learner is None:
-            self.model_mu_t = deepcopy(learner)
-        else:
-            self.model_mu_t = treatment_outcome_learner
-
-        if treatment_effect_learner is None:
-            self.model_tau = deepcopy(learner)
-        else:
-            self.model_tau = treatment_effect_learner
 
+        Note: arguments are stored verbatim (scikit-learn convention) so that
+        ``get_params`` / ``clone`` work correctly. Model construction is deferred to ``fit()``.
+        Per the scikit-learn convention, ``__init__`` does not validate or raise —
+        validation happens in ``fit()``.
+        """
+        # Store verbatim — no deepcopy, no logic (scikit-learn convention).
+        self.learner = learner
+        self.control_outcome_learner = control_outcome_learner
+        self.treatment_outcome_learner = treatment_outcome_learner
+        self.treatment_effect_learner = treatment_effect_learner
         self.ate_alpha = ate_alpha
         self.control_name = control_name
-
-        self.propensity = None
-
-    def __repr__(self):
-        return (
-            "{}(control_outcome_learner={},\n"
-            "\ttreatment_outcome_learner={},\n"
-            "\ttreatment_effect_learner={})".format(
-                self.__class__.__name__,
-                self.model_mu_c.__repr__(),
-                self.model_mu_t.__repr__(),
-                self.model_tau.__repr__(),
-            )
-        )
+        # Fitted-state placeholder (not a constructor param, so clone() ignores it).
+        # NOTE(review): the pre-fit guard in estimate_ate checks `t_groups`, not this.
+        self.propensity = {}
 
     def fit(self, X, treatment, y, p=None, seed=None):
         """Fit the inference model.
@@ -92,17 +70,42 @@ def fit(self, X, treatment, y, p=None, seed=None):
             X (np.matrix or np.array or pd.Dataframe): a feature matrix
             treatment (np.array or pd.Series): a treatment vector
             y (np.array or pd.Series): an outcome vector
-            p (np.ndarray or pd.Series or dict, optional): an array of propensity scores of float (0,1) in the
-                single-treatment case; or, a dictionary of treatment groups that map to propensity vectors of
-                float (0,1); if None will run ElasticNetPropensityModel() to generate the propensity scores.
+            p (np.ndarray or pd.Series or dict, optional): propensity scores in (0,1); if None, ElasticNetPropensityModel() generates them
             seed (int): random seed for cross-fitting
         """
+        if (self.learner is None) and (
+            (self.control_outcome_learner is None)
+            or (self.treatment_outcome_learner is None)
+            or (self.treatment_effect_learner is None)
+        ):
+            raise ValueError(
+                "Either `learner` or all three of `control_outcome_learner`, "
+                "`treatment_outcome_learner`, and `treatment_effect_learner` "
+                "must be specified."
+            )
         X, treatment, y = convert_pd_to_np(X, treatment, y)
         check_treatment_vector(treatment, self.control_name)
         self.t_groups = np.unique(treatment[treatment != self.control_name])
         self.t_groups.sort()
         self._classes = {group: i for i, group in enumerate(self.t_groups)}
 
+        # Resolve base models from stored constructor args (scikit-learn convention).
+        _control_outcome_learner = (
+            self.control_outcome_learner
+            if self.control_outcome_learner is not None
+            else deepcopy(self.learner)
+        )
+        _treatment_outcome_learner = (
+            self.treatment_outcome_learner
+            if self.treatment_outcome_learner is not None
+            else deepcopy(self.learner)
+        )
+        _treatment_effect_learner = (
+            self.treatment_effect_learner
+            if self.treatment_effect_learner is not None
+            else deepcopy(self.learner)
+        )
+
         # The estimator splits the data into 3 partitions for cross-fit on the propensity score estimation,
         # the outcome regression, and the treatment regression on the doubly robust estimates. The use of
         # the partitions is rotated so we do not lose on the sample size.
@@ -110,23 +113,23 @@ def fit(self, X, treatment, y, p=None, seed=None):
         split_indices = [index for _, index in cv.split(y)]
 
         self.models_mu_c = [
-            deepcopy(self.model_mu_c),
-            deepcopy(self.model_mu_c),
-            deepcopy(self.model_mu_c),
+            deepcopy(_control_outcome_learner),
+            deepcopy(_control_outcome_learner),
+            deepcopy(_control_outcome_learner),
         ]
         self.models_mu_t = {
             group: [
-                deepcopy(self.model_mu_t),
-                deepcopy(self.model_mu_t),
-                deepcopy(self.model_mu_t),
+                deepcopy(_treatment_outcome_learner),
+                deepcopy(_treatment_outcome_learner),
+                deepcopy(_treatment_outcome_learner),
             ]
             for group in self.t_groups
         }
         self.models_tau = {
             group: [
-                deepcopy(self.model_tau),
-                deepcopy(self.model_tau),
-                deepcopy(self.model_tau),
+                deepcopy(_treatment_effect_learner),
+                deepcopy(_treatment_effect_learner),
+                deepcopy(_treatment_effect_learner),
             ]
             for group in self.t_groups
         }
@@ -200,6 +203,7 @@ def fit(self, X, treatment, y, p=None, seed=None):
                     - mu_c
                 )
                 self.models_tau[group][ifold].fit(X_filt, dr)
+        return self
 
     def bootstrap(self, X, treatment, y, p=None, size=10000, rng=None, seed=None):
         """Runs a single bootstrap with optional deterministic cross-fit seed."""
@@ -237,9 +241,7 @@ def predict(
         te = np.zeros((X.shape[0], self.t_groups.shape[0]))
         yhat_ts = {}
 
-        # models_mu_c is fold-specific but not group-specific; predict once and reuse.
         yhat_c = np.r_[[model.predict(X) for model in self.models_mu_c]].mean(axis=0)
-        # Shared-reference dict preserves the public yhat_cs[group] API cheaply.
         yhat_cs = {group: yhat_c for group in self.t_groups}
 
         for i, group in enumerate(self.t_groups):
@@ -281,15 +283,13 @@ def fit_predict(
         verbose=True,
         seed=None,
     ):
-        """Fit the treatment effect and outcome models of the R learner and predict treatment effects.
+        """Fit the DR-learner and predict treatment effects.
 
         Args:
             X (np.matrix or np.array or pd.Dataframe): a feature matrix
             treatment (np.array or pd.Series): a treatment vector
             y (np.array or pd.Series): an outcome vector
-            p (np.ndarray or pd.Series or dict, optional): an array of propensity scores of float (0,1) in the
-                single-treatment case; or, a dictionary of treatment groups that map to propensity vectors of
-                float (0,1); if None will run ElasticNetPropensityModel() to generate the propensity scores.
+            p (np.ndarray or pd.Series or dict, optional): propensity scores
             return_ci (bool): whether to return confidence intervals
             n_bootstraps (int): number of bootstrap iterations
             bootstrap_size (int): number of samples per bootstrap
@@ -297,9 +297,7 @@ def fit_predict(
             verbose (str): whether to output progress logs
             seed (int): random seed for cross-fitting
         Returns:
-            (numpy.ndarray): Predictions of treatment effects. Output dim: [n_samples, n_treatment]
-                If return_ci, returns CATE [n_samples, n_treatment], LB [n_samples, n_treatment],
-                UB [n_samples, n_treatment]
+            (numpy.ndarray): Predictions of treatment effects, [n_samples, n_treatment]. If return_ci, returns CATE, LB, UB.
         """
         X, treatment, y = convert_pd_to_np(X, treatment, y)
         self.fit(X, treatment, y, p, seed)
@@ -331,7 +329,6 @@ def fit_predict(
             te_bootstraps = np.zeros(
                 shape=(X.shape[0], self.t_groups.shape[0], n_bootstraps)
             )
-            # seed controls both bootstrap resampling and cross-fit randomness.
             rng = np.random.default_rng(seed) if seed is not None else None
 
             logger.info("Bootstrap Confidence Intervals")
@@ -357,7 +354,6 @@ def fit_predict(
                 te_bootstraps, (1 - self.ate_alpha / 2) * 100, axis=2
             )
 
-            # set member variables back to global (currently last bootstrapped outcome)
             self.t_groups = t_groups_global
             self._classes = _classes_global
             self.models_mu_c = deepcopy(models_mu_c_global)
@@ -384,9 +380,7 @@ def estimate_ate(
             X (np.matrix or np.array or pd.Dataframe): a feature matrix
             treatment (np.array or pd.Series): a treatment vector
             y (np.array or pd.Series): an outcome vector
-            p (np.ndarray or pd.Series or dict, optional): an array of propensity scores of float (0,1) in the
-                single-treatment case; or, a dictionary of treatment groups that map to propensity vectors of
-                float (0,1); if None will run ElasticNetPropensityModel() to generate the propensity scores.
+            p (np.ndarray or pd.Series or dict, optional): propensity scores
             bootstrap_ci (bool): whether run bootstrap for confidence intervals
             n_bootstraps (int): number of bootstrap iterations
             bootstrap_size (int): number of samples per bootstrap
@@ -396,6 +390,10 @@ def estimate_ate(
             The mean and confidence interval (LB, UB) of the ATE estimate.
         """
         if pretrain:
+            if not hasattr(self, "t_groups"):
+                raise ValueError(
+                    "No fitted model found. Call fit() before estimate_ate(pretrain=True)."
+                )
             te, yhat_cs, yhat_ts = self.predict(
                 X, treatment, y, p, return_components=True
             )
@@ -433,8 +431,6 @@ def estimate_ate(
             yhat_t = yhat_ts[group][mask]
             y_filt = y[mask]
 
-            # SE formula is based on the lower bound formula (7) from Imbens, Guido W., and Jeffrey M. Wooldridge. 2009.
-            # "Recent Developments in the Econometrics of Program Evaluation." Journal of Economic Literature
             se = np.sqrt(
                 (
                     (y_filt[w == 0] - yhat_c[w == 0]).var() / (1 - prob_treatment)
@@ -462,7 +458,6 @@ def estimate_ate(
 
             logger.info("Bootstrap Confidence Intervals for ATE")
             ate_bootstraps = np.zeros(shape=(self.t_groups.shape[0], n_bootstraps))
-            # seed controls both bootstrap resampling and cross-fit randomness.
             rng = np.random.default_rng(seed) if seed is not None else None
 
             for n in tqdm(range(n_bootstraps)):
@@ -489,7 +484,6 @@ def estimate_ate(
                 ate_bootstraps, (1 - self.ate_alpha / 2) * 100, axis=1
             )
 
-            # set member variables back to global (currently last bootstrapped outcome)
             self.t_groups = t_groups_global
             self._classes = _classes_global
             self.models_mu_c = deepcopy(models_mu_c_global)
@@ -512,17 +506,6 @@ def __init__(
         ate_alpha=0.05,
         control_name=0,
     ):
-        """Initialize an DR-learner regressor.
-
-        Args:
-            learner (optional): a model to estimate outcomes and treatment effects in both the control and treatment
-                groups
-            control_outcome_learner (optional): a model to estimate outcomes in the control group
-            treatment_outcome_learner (optional): a model to estimate outcomes in the treatment group
-            treatment_effect_learner (optional): a model to estimate treatment effects in the treatment group
-            ate_alpha (float, optional): the confidence level alpha of the ATE estimate
-            control_name (str or int, optional): name of control group
-        """
         super().__init__(
             learner=learner,
             control_outcome_learner=control_outcome_learner,
@@ -547,20 +530,6 @@ def __init__(
         ate_alpha=0.05,
         control_name=0,
     ):
-        """Initialize a DR-learner classifier.
-
-        Args:
-            learner (optional): a model to estimate outcomes and treatment effects in both the control and treatment
-                groups. Should have a predict_proba() method for outcome models.
-            control_outcome_learner (optional): a model to estimate outcomes in the control group.
-                Should have a predict_proba() method.
-            treatment_outcome_learner (optional): a model to estimate outcomes in the treatment group.
-                Should have a predict_proba() method.
-            treatment_effect_learner (optional): a model to estimate treatment effects in the treatment group.
-                Should be a regressor.
-            ate_alpha (float, optional): the confidence level alpha of the ATE estimate
-            control_name (str or int, optional): name of control group
-        """
         super().__init__(
             learner=learner,
             control_outcome_learner=control_outcome_learner,
@@ -573,32 +542,12 @@ def __init__(
     def predict(
         self, X, treatment=None, y=None, p=None, return_components=False, verbose=True
     ):
-        """Predict treatment effects.
-
-        Args:
-            X (np.matrix or np.array or pd.Dataframe): a feature matrix
-            treatment (np.array or pd.Series, optional): a treatment vector. Used for computing
-                classification metrics when y is also provided.
-            y (np.array or pd.Series, optional): an outcome vector. Used for computing
-                classification metrics when treatment is also provided.
-            p (np.ndarray or pd.Series or dict, optional): an array of propensity scores of float (0,1) in the
-                single-treatment case; or, a dictionary of treatment groups that map to propensity vectors of
-                float (0,1). Currently not used in prediction but kept for API consistency.
-            return_components (bool, optional): whether to return outcome probabilities for treatment and control
-                groups separately. Defaults to False.
-            verbose (bool, optional): whether to output progress logs. Defaults to True.
-        Returns:
-            (numpy.ndarray): Predictions of treatment effects.
-            If return_components is True, also returns:
-                - dict: Predicted probabilities for the control group (yhat_cs).
-                - dict: Predicted probabilities for the treatment group (yhat_ts).
-        """
+        """Predict treatment effects (classifier variant — uses predict_proba for outcomes)."""
         X, treatment, y = convert_pd_to_np(X, treatment, y)
 
         te = np.zeros((X.shape[0], self.t_groups.shape[0]))
         yhat_ts = {}
 
-        # models_mu_c is fold-specific but not group-specific; predict once and reuse.
         yhat_c = np.r_[
             [model.predict_proba(X)[:, 1] for model in self.models_mu_c]
         ].mean(axis=0)
diff --git a/causalml/inference/meta/rlearner.py b/causalml/inference/meta/rlearner.py
index 53563cce..1d76b15e 100644
--- a/causalml/inference/meta/rlearner.py
+++ b/causalml/inference/meta/rlearner.py
@@ -53,38 +53,23 @@ def __init__(
             random_state (int or RandomState, optional): a seed (int) or random number generator (RandomState)
             cv_n_jobs (int, optional): number of parallel jobs to run for cross_val_predict. -1 means using all
                 processors
-        """
-        assert (learner is not None) or (
-            (outcome_learner is not None) and (effect_learner is not None)
-        )
-        assert propensity_learner is not None
-
-        self.model_mu = (
-            outcome_learner if outcome_learner is not None else deepcopy(learner)
-        )
-        self.model_tau = (
-            effect_learner if effect_learner is not None else deepcopy(learner)
-        )
-        self.model_p = propensity_learner
 
+        Note: arguments are stored verbatim (scikit-learn convention) so that
+        ``get_params`` / ``clone`` work correctly. Model construction is deferred to ``fit()``.
+        Per the scikit-learn convention, ``__init__`` does not validate or raise —
+        validation of ``learner``/``outcome_learner``/``effect_learner`` happens in ``fit()``.
+        """
+        # Store verbatim — no deepcopy, no logic (scikit-learn convention).
+        self.learner = learner
+        self.outcome_learner = outcome_learner
+        self.effect_learner = effect_learner
+        self.propensity_learner = propensity_learner
         self.ate_alpha = ate_alpha
         self.control_name = control_name
-
+        self.n_fold = n_fold
         self.random_state = random_state
-        self.cv = KFold(n_splits=n_fold, shuffle=True, random_state=random_state)
         self.cv_n_jobs = cv_n_jobs
 
-        self.propensity = None
-        self.propensity_model = None
-
-    def __repr__(self):
-        return (
-            f"{self.__class__.__name__}\n"
-            f"\toutcome_learner={self.model_mu.__repr__()}\n"
-            f"\teffect_learner={self.model_tau.__repr__()}\n"
-            f"\tpropensity_learner={self.model_p.__repr__()}"
-        )
-
     def fit(self, X, treatment, y, p=None, sample_weight=None, verbose=True):
         """Fit the treatment effect and outcome models of the R learner.
 
@@ -92,13 +77,19 @@ def fit(self, X, treatment, y, p=None, sample_weight=None, verbose=True):
             X (np.matrix or np.array or pd.Dataframe): a feature matrix
             treatment (np.array or pd.Series): a treatment vector
             y (np.array or pd.Series): an outcome vector
-            p (np.ndarray or pd.Series or dict, optional): an array of propensity scores of float (0,1) in the
-                single-treatment case; or, a dictionary of treatment groups that map to propensity vectors of
-                float (0,1); if None will run ElasticNetPropensityModel() to generate the propensity scores.
-            sample_weight (np.array or pd.Series, optional): an array of sample weights indicating the
-                weight of each observation for `effect_learner`. If None, it assumes equal weight.
+            p (np.ndarray or pd.Series or dict, optional): propensity scores
+            sample_weight (np.array or pd.Series, optional): sample weights for `effect_learner`.
             verbose (bool, optional): whether to output progress logs
         """
+        if (self.learner is None) and (
+            (self.outcome_learner is None) or (self.effect_learner is None)
+        ):
+            raise ValueError(
+                "Either `learner` or both `outcome_learner` and `effect_learner` "
+                "must be specified."
+            )
+        if self.propensity_learner is None:
+            raise ValueError("`propensity_learner` must be specified.")
         X, treatment, y = convert_pd_to_np(X, treatment, y)
         check_treatment_vector(treatment, self.control_name)
         if sample_weight is not None:
@@ -106,6 +97,7 @@ def fit(self, X, treatment, y, p=None, sample_weight=None, verbose=True):
                 y
             ), "Data length must be equal for sample_weight and the input data"
             sample_weight = convert_pd_to_np(sample_weight)
+
         self.t_groups = np.unique(treatment[treatment != self.control_name])
         self.t_groups.sort()
 
@@ -116,6 +108,24 @@ def fit(self, X, treatment, y, p=None, sample_weight=None, verbose=True):
             p = self._format_p(p, self.t_groups)
 
         self._classes = {group: i for i, group in enumerate(self.t_groups)}
+
+        # Resolve base models from stored constructor args (scikit-learn convention).
+        self.model_mu = (
+            self.outcome_learner
+            if self.outcome_learner is not None
+            else deepcopy(self.learner)
+        )
+        self.model_tau = (
+            self.effect_learner
+            if self.effect_learner is not None
+            else deepcopy(self.learner)
+        )
+        self.model_p = self.propensity_learner
+        # Build CV splitter from stored n_fold / random_state.
+        self.cv = KFold(
+            n_splits=self.n_fold, shuffle=True, random_state=self.random_state
+        )
+
         self.models_tau = {group: deepcopy(self.model_tau) for group in self.t_groups}
         self.vars_c = {}
         self.vars_t = {}
@@ -142,7 +152,7 @@ def fit(self, X, treatment, y, p=None, sample_weight=None, verbose=True):
                 sample_weight_filt_t = sample_weight_filt[w == 1]
                 self.vars_c[group] = get_weighted_variance(diff_c, sample_weight_filt_c)
                 self.vars_t[group] = get_weighted_variance(diff_t, sample_weight_filt_t)
-                weight *= sample_weight_filt  # update weight
+                weight *= sample_weight_filt
             else:
                 self.vars_c[group] = diff_c.var()
                 self.vars_t[group] = diff_t.var()
@@ -156,6 +166,7 @@ def fit(self, X, treatment, y, p=None, sample_weight=None, verbose=True):
             self.models_tau[group].fit(
                 X_filt, (y_filt - yhat_filt) / (w - p_filt), sample_weight=weight
             )
+        return self
 
     def predict(self, X, p=None):
         """Predict treatment effects.
@@ -186,25 +197,20 @@ def fit_predict(
         bootstrap_size=10000,
         verbose=True,
     ):
-        """Fit the treatment effect and outcome models of the R learner and predict treatment effects.
+        """Fit the R learner and predict treatment effects.
 
         Args:
             X (np.matrix or np.array or pd.Dataframe): a feature matrix
             treatment (np.array or pd.Series): a treatment vector
             y (np.array or pd.Series): an outcome vector
-            p (np.ndarray or pd.Series or dict, optional): an array of propensity scores of float (0,1) in the
-                single-treatment case; or, a dictionary of treatment groups that map to propensity vectors of
-                float (0,1); if None will run ElasticNetPropensityModel() to generate the propensity scores.
-            sample_weight (np.array or pd.Series, optional): an array of sample weights indicating the
-                weight of each observation for `effect_learner`. If None, it assumes equal weight.
+            p (np.ndarray or pd.Series or dict, optional): propensity scores
+            sample_weight (np.array or pd.Series, optional): sample weights
             return_ci (bool): whether to return confidence intervals
             n_bootstraps (int): number of bootstrap iterations
             bootstrap_size (int): number of samples per bootstrap
             verbose (bool): whether to output progress logs
         Returns:
-            (numpy.ndarray): Predictions of treatment effects. Output dim: [n_samples, n_treatment].
-                If return_ci, returns CATE [n_samples, n_treatment], LB [n_samples, n_treatment],
-                UB [n_samples, n_treatment]
+            (numpy.ndarray): Predictions of treatment effects, [n_samples, n_treatment]. If return_ci, returns CATE, LB, UB.
         """
         X, treatment, y = convert_pd_to_np(X, treatment, y)
         self.fit(X, treatment, y, p, sample_weight, verbose=verbose)
@@ -235,7 +241,6 @@ def fit_predict(
                 te_bootstraps, (1 - self.ate_alpha / 2) * 100, axis=2
             )
 
-            # set member variables back to global (currently last bootstrapped outcome)
             self.t_groups = t_groups_global
             self._classes = _classes_global
             self.model_mu = deepcopy(model_mu_global)
@@ -259,13 +264,10 @@ def estimate_ate(
 
         Args:
             X (np.matrix or np.array or pd.Dataframe): a feature matrix
-            treatment (np.array or pd.Series): only needed when pretrain=False, a treatment vector
-            y (np.array or pd.Series):only needed when pretrain=False, an outcome vector
-            p (np.ndarray or pd.Series or dict, optional): an array of propensity scores of float (0,1) in the
-                single-treatment case; or, a dictionary of treatment groups that map to propensity vectors of
-                float (0,1); if None will run ElasticNetPropensityModel() to generate the propensity scores.
-            sample_weight (np.array or pd.Series, optional): an array of sample weights indicating the
-                weight of each observation for `effect_learner`. If None, it assumes equal weight.
+            treatment (np.array or pd.Series): treatment vector (needed when pretrain=False)
+            y (np.array or pd.Series): outcome vector (needed when pretrain=False)
+            p (np.ndarray or pd.Series or dict, optional): propensity scores
+            sample_weight (np.array or pd.Series, optional): sample weights
             bootstrap_ci (bool): whether run bootstrap for confidence intervals
             n_bootstraps (int): number of bootstrap iterations
             bootstrap_size (int): number of samples per bootstrap
@@ -278,7 +280,7 @@ def estimate_ate(
             te = self.predict(X, p)
         else:
             if not len(treatment) or not len(y):
-                raise ValueError("treatmeng and y must be provided when pretrain=False")
+                raise ValueError("treatment and y must be provided when pretrain=False")
             te = self.fit_predict(X, treatment, y, p, sample_weight, return_ci=False)
 
         ate = np.zeros(self.t_groups.shape[0])
@@ -332,7 +334,6 @@ def estimate_ate(
                 ate_bootstraps, (1 - self.ate_alpha / 2) * 100, axis=1
             )
 
-            # set member variables back to global (currently last bootstrapped outcome)
             self.t_groups = t_groups_global
             self._classes = _classes_global
             self.model_mu = deepcopy(model_mu_global)
@@ -356,20 +357,6 @@ def __init__(
         n_fold=5,
         random_state=None,
     ):
-        """Initialize an R-learner regressor.
-
-        Args:
-            learner (optional): a model to estimate outcomes and treatment effects
-            outcome_learner (optional): a model to estimate outcomes
-            effect_learner (optional): a model to estimate treatment effects. It needs to take `sample_weight` as an
-                input argument for `fit()`
-            propensity_learner (optional): a model to estimate propensity scores. `ElasticNetPropensityModel()` will
-                be used by default.
-            ate_alpha (float, optional): the confidence level alpha of the ATE estimate
-            control_name (str or int, optional): name of control group
-            n_fold (int, optional): the number of cross validation folds for outcome_learner
-            random_state (int or RandomState, optional): a seed (int) or random number generator (RandomState)
-        """
         super().__init__(
             learner=learner,
             outcome_learner=outcome_learner,
@@ -400,16 +387,19 @@ def __init__(
         """Initialize an R-learner classifier.
 
         Args:
-            outcome_learner: a model to estimate outcomes. Should be a classifier.
-            effect_learner: a model to estimate treatment effects. It needs to take `sample_weight` as an
-                input argument for `fit()`. Should be a regressor.
-            propensity_learner (optional): a model to estimate propensity scores. `ElasticNetPropensityModel()` will
-                be used by default.
-            ate_alpha (float, optional): the confidence level alpha of the ATE estimate
+            outcome_learner: a classifier for outcomes.
+            effect_learner: a regressor for treatment effects (needs ``sample_weight`` in fit).
+            propensity_learner (optional): a propensity model. Defaults to ElasticNetPropensityModel.
+            ate_alpha (float, optional): confidence level alpha
             control_name (str or int, optional): name of control group
-            n_fold (int, optional): the number of cross validation folds for outcome_learner
-            random_state (int or RandomState, optional): a seed (int) or random number generator (RandomState)
+            n_fold (int, optional): CV folds for outcome_learner
+            random_state (int or RandomState, optional): random seed
         """
+        if (outcome_learner is None) and (effect_learner is None):
+            raise ValueError(
+                "Either the outcome learner or the effect learner must be specified."
+            )
+
         super().__init__(
             learner=None,
             outcome_learner=outcome_learner,
@@ -421,25 +411,8 @@ def __init__(
             random_state=random_state,
         )
 
-        if (outcome_learner is None) and (effect_learner is None):
-            raise ValueError(
-                "Either the outcome learner or the effect learner must be specified."
-            )
-
     def fit(self, X, treatment, y, p=None, sample_weight=None, verbose=True):
-        """Fit the treatment effect and outcome models of the R learner.
-
-        Args:
-            X (np.matrix or np.array or pd.Dataframe): a feature matrix
-            treatment (np.array or pd.Series): a treatment vector
-            y (np.array or pd.Series): an outcome vector
-            p (np.ndarray or pd.Series or dict, optional): an array of propensity scores of float (0,1) in the
-                single-treatment case; or, a dictionary of treatment groups that map to propensity vectors of
-                float (0,1); if None will run ElasticNetPropensityModel() to generate the propensity scores.
-            sample_weight (np.array or pd.Series, optional): an array of sample weights indicating the
-                weight of each observation for `effect_learner`. If None, it assumes equal weight.
-            verbose (bool, optional): whether to output progress logs
-        """
+        """Fit the R-learner classifier (uses predict_proba for outcome estimates)."""
         X, treatment, y = convert_pd_to_np(X, treatment, y)
         check_treatment_vector(treatment, self.control_name)
         if sample_weight is not None:
@@ -457,6 +430,15 @@ def fit(self, X, treatment, y, p=None, sample_weight=None, verbose=True):
             p = self._format_p(p, self.t_groups)
 
         self._classes = {group: i for i, group in enumerate(self.t_groups)}
+
+        # Resolve base models from stored constructor args.
+        self.model_mu = self.outcome_learner
+        self.model_tau = self.effect_learner
+        self.model_p = self.propensity_learner
+        self.cv = KFold(
+            n_splits=self.n_fold, shuffle=True, random_state=self.random_state
+        )
+
         self.models_tau = {group: deepcopy(self.model_tau) for group in self.t_groups}
         self.vars_c = {}
         self.vars_t = {}
@@ -485,7 +467,7 @@ def fit(self, X, treatment, y, p=None, sample_weight=None, verbose=True):
                 sample_weight_filt_t = sample_weight_filt[w == 1]
                 self.vars_c[group] = get_weighted_variance(diff_c, sample_weight_filt_c)
                 self.vars_t[group] = get_weighted_variance(diff_t, sample_weight_filt_t)
-                weight *= sample_weight_filt  # update weight
+                weight *= sample_weight_filt
             else:
                 self.vars_c[group] = diff_c.var()
                 self.vars_t[group] = diff_t.var()
@@ -499,16 +481,10 @@ def fit(self, X, treatment, y, p=None, sample_weight=None, verbose=True):
             self.models_tau[group].fit(
                 X_filt, (y_filt - yhat_filt) / (w - p_filt), sample_weight=weight
             )
+        return self
 
     def predict(self, X, p=None):
-        """Predict treatment effects.
-
-        Args:
-            X (np.matrix or np.array or pd.Dataframe): a feature matrix
-
-        Returns:
-            (numpy.ndarray): Predictions of treatment effects.
-        """
+        """Predict treatment effects."""
         X = convert_pd_to_np(X)
         te = np.zeros((X.shape[0], self.t_groups.shape[0]))
         for i, group in enumerate(self.t_groups):
@@ -519,6 +495,18 @@ def predict(self, X, p=None):
 
 
 class XGBRRegressor(BaseRRegressor):
+    """An R-learner regressor using XGBoost models.
+
+    Stores every constructor argument verbatim (scikit-learn convention) so
+    that ``get_params()`` / ``clone()`` work correctly. All XGBRegressor
+    construction is deferred to ``fit()``.
+
+    Additional XGBoost keyword arguments (e.g. ``max_depth``, ``learning_rate``)
+    are passed as a dict through the ``xgb_kwargs`` parameter (not as ``**kwargs``)
+    and stored verbatim as ``self.xgb_kwargs``, so that ``get_params()`` surfaces
+    them and ``clone()`` round-trips them correctly.
+    """
+
     def __init__(
         self,
         early_stopping=True,
@@ -527,73 +515,58 @@ def __init__(
         effect_learner_objective="reg:squarederror",
         effect_learner_n_estimators=500,
         random_state=42,
-        *args,
-        **kwargs,
+        ate_alpha=0.05,
+        control_name=0,
+        n_fold=5,
+        xgb_kwargs=None,
     ):
-        """Initialize an R-learner regressor with XGBoost model using pairwise ranking objective.
+        """Initialize an R-learner regressor with XGBoost models.
 
         Args:
-            early_stopping: whether or not to use early stopping when fitting effect learner
-            test_size (float, optional): the proportion of the dataset to use as validation set when early stopping is
-                                         enabled
-            early_stopping_rounds (int, optional): validation metric needs to improve at least once in every
-                                                   early_stopping_rounds round(s) to continue training
-            effect_learner_objective (str, optional): the learning objective for the effect learner
-                                                      (default = 'reg:squarederror')
-            effect_learner_n_estimators (int, optional): number of trees to fit for the effect learner (default = 500)
+            early_stopping (bool, optional): whether to use early stopping for the effect learner
+            test_size (float, optional): held-out fraction for early stopping eval set
+            early_stopping_rounds (int, optional): early stopping patience
+            effect_learner_objective (str, optional): XGBoost objective for the effect learner
+            effect_learner_n_estimators (int, optional): n_estimators for the effect learner
+            random_state (int, optional): random seed (must be int)
+            ate_alpha (float, optional): confidence level alpha of the ATE estimate
+            control_name (str or int, optional): name of control group
+            n_fold (int, optional): CV folds for the outcome learner
+            xgb_kwargs (dict, optional): additional keyword arguments forwarded verbatim
+                to both XGBRegressor instances (outcome and effect learners), e.g.
+                ``xgb_kwargs={'max_depth': 4, 'learning_rate': 0.05}``.
+
+        Note: all arguments are stored verbatim (scikit-learn convention) so that
+        ``get_params`` / ``clone`` work correctly. XGBRegressor construction is
+        deferred to ``fit()``.
         """
-
         assert isinstance(random_state, int), "random_state should be int."
 
-        objective, metric = get_xgboost_objective_metric(effect_learner_objective)
-        self.effect_learner_objective = objective
-        self.effect_learner_eval_metric = metric
-        self.effect_learner_n_estimators = effect_learner_n_estimators
+        # Store verbatim — no transformation, no XGBRegressor construction here.
+        # xgb_kwargs=None is stored as-is; BaseEstimator.get_params surfaces it
+        # correctly since it is a named parameter.  The or {} coalesce happens in
+        # fit() so that clone(XGBRRegressor()) still round-trips None → None.
         self.early_stopping = early_stopping
-        if self.early_stopping:
-            self.test_size = test_size
-            self.early_stopping_rounds = early_stopping_rounds
-
-            effect_learner = XGBRegressor(
-                objective=self.effect_learner_objective,
-                n_estimators=self.effect_learner_n_estimators,
-                eval_metric=self.effect_learner_eval_metric,
-                early_stopping_rounds=self.early_stopping_rounds,
-                random_state=random_state,
-                *args,
-                **kwargs,
-            )
-        else:
-            effect_learner = XGBRegressor(
-                objective=self.effect_learner_objective,
-                n_estimators=self.effect_learner_n_estimators,
-                eval_metric=self.effect_learner_eval_metric,
-                random_state=random_state,
-                *args,
-                **kwargs,
-            )
+        self.test_size = test_size
+        self.early_stopping_rounds = early_stopping_rounds
+        self.effect_learner_objective = effect_learner_objective
+        self.effect_learner_n_estimators = effect_learner_n_estimators
+        self.xgb_kwargs = xgb_kwargs
 
         super().__init__(
-            outcome_learner=XGBRegressor(random_state=random_state, *args, **kwargs),
-            effect_learner=effect_learner,
+            learner=None,
+            outcome_learner=None,
+            effect_learner=None,
+            ate_alpha=ate_alpha,
+            control_name=control_name,
+            n_fold=n_fold,
+            random_state=random_state,
         )
 
     def fit(self, X, treatment, y, p=None, sample_weight=None, verbose=True):
-        """Fit the treatment effect and outcome models of the R learner.
-
-        Args:
-            X (np.matrix or np.array or pd.Dataframe): a feature matrix
-            y (np.array or pd.Series): an outcome vector
-            p (np.ndarray or pd.Series or dict, optional): an array of propensity scores of float (0,1) in the
-                single-treatment case; or, a dictionary of treatment groups that map to propensity vectors of
-                float (0,1); if None will run ElasticNetPropensityModel() to generate the propensity scores.
-            sample_weight (np.array or pd.Series, optional): an array of sample weights indicating the
-                weight of each observation for `effect_learner`. If None, it assumes equal weight.
-            verbose (bool, optional): whether to output progress logs
-        """
+        """Fit the XGBoost R-learner (early stopping is used only if enabled)."""
         X, treatment, y = convert_pd_to_np(X, treatment, y)
         check_treatment_vector(treatment, self.control_name)
-        # initialize equal sample weight if it's not provided, for simplicity purpose
         sample_weight = (
             convert_pd_to_np(sample_weight)
             if sample_weight is not None
@@ -612,6 +585,38 @@ def fit(self, X, treatment, y, p=None, sample_weight=None, verbose=True):
             p = self._format_p(p, self.t_groups)
 
         self._classes = {group: i for i, group in enumerate(self.t_groups)}
+
+        # Resolve XGBRegressor models here (not in __init__) so get_params/clone
+        # stay correct — the constructor only stores plain, verbatim values.
+        # self.xgb_kwargs holds any extra XGBoost params (e.g. max_depth) verbatim.
+        objective, metric = get_xgboost_objective_metric(self.effect_learner_objective)
+        xgb_kw = self.xgb_kwargs or {}
+        if self.early_stopping:
+            effect_learner = XGBRegressor(
+                objective=objective,
+                n_estimators=self.effect_learner_n_estimators,
+                eval_metric=metric,
+                early_stopping_rounds=self.early_stopping_rounds,
+                random_state=self.random_state,
+                **xgb_kw,
+            )
+        else:
+            effect_learner = XGBRegressor(
+                objective=objective,
+                n_estimators=self.effect_learner_n_estimators,
+                eval_metric=metric,
+                random_state=self.random_state,
+                **xgb_kw,
+            )
+        outcome_learner = XGBRegressor(random_state=self.random_state, **xgb_kw)
+
+        self.model_mu = outcome_learner
+        self.model_tau = effect_learner
+        self.model_p = self.propensity_learner
+        self.cv = KFold(
+            n_splits=self.n_fold, shuffle=True, random_state=self.random_state
+        )
+
         self.models_tau = {group: deepcopy(self.model_tau) for group in self.t_groups}
         self.vars_c = {}
         self.vars_t = {}
@@ -679,7 +684,6 @@ def fit(self, X, treatment, y, p=None, sample_weight=None, verbose=True):
                     ],
                     verbose=verbose,
                 )
-
             else:
                 self.models_tau[group].fit(
                     X_filt,
@@ -693,3 +697,4 @@ def fit(self, X, treatment, y, p=None, sample_weight=None, verbose=True):
             sample_weight_filt_t = sample_weight_filt[w == 1]
             self.vars_c[group] = get_weighted_variance(diff_c, sample_weight_filt_c)
             self.vars_t[group] = get_weighted_variance(diff_t, sample_weight_filt_t)
+        return self
diff --git a/causalml/inference/meta/slearner.py b/causalml/inference/meta/slearner.py
index a5df639c..ee599e7e 100644
--- a/causalml/inference/meta/slearner.py
+++ b/causalml/inference/meta/slearner.py
@@ -36,6 +36,7 @@ def fit(self, X, y):
         self.model = sm.OLS(y, X).fit(cov_type=self.cov_type)
         self.coefficients = self.model.params
         self.conf_ints = self.model.conf_int(alpha=self.alpha)
+        return self
 
     def predict(self, X):
         # Append ones. The first column is for the treatment indicator.
@@ -51,22 +52,23 @@ class BaseSLearner(BaseLearner):
 
     def __init__(self, learner=None, ate_alpha=0.05, control_name=0):
         """Initialize an S-learner.
+
         Args:
-            learner (optional): a model to estimate the treatment effect
+            learner (optional): a model to estimate the treatment effect.
+                If None, a DummyRegressor is used.  The argument is stored
+                verbatim so that ``get_params`` / ``clone`` work correctly
+                (scikit-learn convention).
+            ate_alpha (float, optional): the confidence level alpha of the ATE estimate
             control_name (str or int, optional): name of control group
         """
-        if learner is not None:
-            self.model = learner
-        else:
-            self.model = DummyRegressor()
+        # Store verbatim — no deepcopy, no logic (scikit-learn convention).
+        self.learner = learner
         self.ate_alpha = ate_alpha
         self.control_name = control_name
 
-    def __repr__(self):
-        return "{}(model={})".format(self.__class__.__name__, self.model.__repr__())
-
     def fit(self, X, treatment, y, p=None):
-        """Fit the inference model
+        """Fit the inference model.
+
         Args:
             X (np.matrix, np.array, or pd.Dataframe): a feature matrix
             treatment (np.array or pd.Series): a treatment vector
@@ -77,7 +79,10 @@ def fit(self, X, treatment, y, p=None):
         self.t_groups = np.unique(treatment[treatment != self.control_name])
         self.t_groups.sort()
         self._classes = {group: i for i, group in enumerate(self.t_groups)}
-        self.models = {group: deepcopy(self.model) for group in self.t_groups}
+
+        # Resolve the base model here (not in __init__) so clone() works cleanly.
+        _base_model = self.learner if self.learner is not None else DummyRegressor()
+        self.models = {group: deepcopy(_base_model) for group in self.t_groups}
 
         for group in self.t_groups:
             mask = (treatment == group) | (treatment == self.control_name)
@@ -88,6 +93,7 @@ def fit(self, X, treatment, y, p=None):
             w = (treatment_filt == group).astype(int)
             X_new = np.hstack((w.reshape((-1, 1)), X_filt))
             self.models[group].fit(X_new, y_filt)
+        return self
 
     def predict(
         self, X, treatment=None, y=None, p=None, return_components=False, verbose=True
diff --git a/causalml/inference/meta/tlearner.py b/causalml/inference/meta/tlearner.py
index a1aef665..749ac127 100644
--- a/causalml/inference/meta/tlearner.py
+++ b/causalml/inference/meta/tlearner.py
@@ -1,4 +1,3 @@
-import copy
 from copy import deepcopy
 import logging
 import numpy as np
@@ -46,44 +45,18 @@ def __init__(
             treatment_learner (model, optional): a model to estimate treatment outcomes
             ate_alpha (float, optional): the confidence level alpha of the ATE estimate
             control_name (str or int, optional): name of control group
-        """
-        assert (learner is not None) or (
-            (control_learner is not None) and (treatment_learner is not None)
-        )
-
-        if control_learner is None:
-            self.model_c = deepcopy(learner)
-        else:
-            self.model_c = control_learner
-
-        # Preserve the unfitted template so repeated fit() calls always start fresh.
-        self._model_c_template = self.model_c
-
-        if treatment_learner is None:
-            self.model_t = deepcopy(learner)
-        else:
-            self.model_t = treatment_learner
-
-        # Preserve the unfitted template so repeated fit() calls always start fresh.
-        self._model_t_template = self.model_t
 
+        Note: arguments are stored verbatim (scikit-learn convention) so that
+        ``get_params`` / ``clone`` work correctly. Model construction is deferred
+        to ``fit()``. Per the scikit-learn convention, ``__init__`` does not
+        validate or raise — validation happens in ``fit()``.
+        """
+        # Store verbatim — no deepcopy, no logic (scikit-learn convention).
+        self.learner = learner
+        self.control_learner = control_learner
+        self.treatment_learner = treatment_learner
         self.ate_alpha = ate_alpha
         self.control_name = control_name
-        self.bootstrap_models_ = None
-
-    def __repr__(self):
-        return "{}(model_c={}, model_t={})".format(
-            self.__class__.__name__, self.model_c.__repr__(), self.model_t.__repr__()
-        )
-
-    def _unfitted_clone(self):
-        template = copy.copy(self)
-        for attr in ("models_c", "models_t", "bootstrap_models_"):
-            if hasattr(template, attr):
-                delattr(template, attr)
-        template.model_c = self._model_c_template
-        template.model_t = self._model_t_template
-        return template
 
     @ignore_warnings(category=ConvergenceWarning)
     def fit(
@@ -109,26 +82,41 @@ def fit(
                 during fit and stores it in self.bootstrap_models_ for post-fit CI
                 estimation via predict(return_ci=True). Default: False.
             n_bootstraps (int, optional): number of bootstrap iterations. Default: 200.
-                Note: storing N bootstraps of a GBM-based learner with k treatment
-                groups holds 2*N*k model objects in memory. Monitor RAM for large N
-                or heavy base learners.
             n_jobs (int, optional): number of parallel jobs for bootstrap fitting.
                 -1 uses all available cores. Default: 1.
             bootstrap_size (int, optional): number of samples per bootstrap. Default: 10000.
             random_state (int, optional): random seed for reproducible bootstrap sampling.
         """
+        if (self.learner is None) and (
+            (self.control_learner is None) or (self.treatment_learner is None)
+        ):
+            raise ValueError(
+                "Either `learner` or both `control_learner` and `treatment_learner` "
+                "must be specified."
+            )
         X, treatment, y = convert_pd_to_np(X, treatment, y)
         check_treatment_vector(treatment, self.control_name)
         self.t_groups = np.unique(treatment[treatment != self.control_name])
         self.t_groups.sort()
         self._classes = {group: i for i, group in enumerate(self.t_groups)}
-        self.models_t = {group: deepcopy(self.model_t) for group in self.t_groups}
 
-        # model_c is trained on the control group, which is identical for every
-        # treatment group, so fit it once. Deepcopy from the unfitted template so
-        # re-calling fit() always starts from a clean state (safe with warm_start).
+        # Resolve base models from the stored constructor args; fit() trains copies.
+        _control_learner = (
+            self.control_learner
+            if self.control_learner is not None
+            else deepcopy(self.learner)
+        )
+        _treatment_learner = (
+            self.treatment_learner
+            if self.treatment_learner is not None
+            else deepcopy(self.learner)
+        )
+
+        self.models_t = {group: deepcopy(_treatment_learner) for group in self.t_groups}
+
+        # model_c is trained on the control group, identical for every treatment group.
         control_mask = treatment == self.control_name
-        self.model_c = deepcopy(self._model_c_template)
+        self.model_c = deepcopy(_control_learner)
         self.model_c.fit(X[control_mask], y[control_mask])
         # Expose as a shared-reference dict to preserve the public models_c API.
         self.models_c = {group: self.model_c for group in self.t_groups}
@@ -149,6 +137,7 @@ def fit(
             )
         else:
             self.bootstrap_models_ = None
+        return self
 
     def _compute_bootstrap_ci(self, X):
         """Compute bootstrap CI using stored ensemble.
@@ -192,12 +181,10 @@ def predict(
             verbose (bool, optional): whether to output progress logs
             return_ci (bool, optional): whether to return confidence intervals
                 using the stored bootstrap ensemble. Requires fit() to have been
-                called with store_bootstraps=True. CI width is controlled by
-                self.ate_alpha set at init time.
+                called with store_bootstraps=True.
         Returns:
             (numpy.ndarray): Predictions of treatment effects. If return_ci=True,
                 returns (te, te_lower, te_upper) each of shape [n_samples, n_treatment].
-                return_ci=True and return_components=True cannot be used together.
         """
         if return_ci and return_components:
             raise ValueError("return_ci and return_components cannot both be True.")
@@ -206,8 +193,6 @@ def predict(
         yhat_ts = {}
 
         yhat_c = self.model_c.predict(X)
-        # Build a shared-reference dict so return_components callers keep the
-        # yhat_cs[group] indexing API without duplicating the underlying array.
         yhat_cs = {group: yhat_c for group in self.t_groups}
 
         for group in self.t_groups:
@@ -321,9 +306,9 @@ def estimate_ate(
             bootstrap_ci (bool): whether to return confidence intervals
             n_bootstraps (int): number of bootstrap iterations
             bootstrap_size (int): number of samples per bootstrap
+            pretrain (bool): whether a model has been fit, default False.
         Returns:
             The mean and confidence interval (LB, UB) of the ATE estimate.
-            pretrain (bool): whether a model has been fit, default False.
         """
         X, treatment, y = convert_pd_to_np(X, treatment, y)
         if pretrain:
@@ -478,6 +463,8 @@ def predict(
         Returns:
             (numpy.ndarray): Predictions of treatment effects.
         """
+        # Fail-fast: validate mutually exclusive flags before doing any work.
+        # Consistent with BaseTLearner.predict which checks at the top.
         if return_ci and return_components:
             raise ValueError("return_ci and return_components cannot both be True.")
 
diff --git a/causalml/inference/meta/xlearner.py b/causalml/inference/meta/xlearner.py
index 968fa870..3bb64cca 100644
--- a/causalml/inference/meta/xlearner.py
+++ b/causalml/inference/meta/xlearner.py
@@ -43,56 +43,24 @@ def __init__(
             treatment_effect_learner (optional): a model to estimate treatment effects in the treatment group
             ate_alpha (float, optional): the confidence level alpha of the ATE estimate
             control_name (str or int, optional): name of control group
-        """
-        assert (learner is not None) or (
-            (control_outcome_learner is not None)
-            and (treatment_outcome_learner is not None)
-            and (control_effect_learner is not None)
-            and (treatment_effect_learner is not None)
-        )
-
-        if control_outcome_learner is None:
-            self.model_mu_c = deepcopy(learner)
-        else:
-            self.model_mu_c = control_outcome_learner
-
-        # Preserve the unfitted template so repeated fit() calls always start fresh.
-        self._model_mu_c_template = self.model_mu_c
-
-        if treatment_outcome_learner is None:
-            self.model_mu_t = deepcopy(learner)
-        else:
-            self.model_mu_t = treatment_outcome_learner
-
-        if control_effect_learner is None:
-            self.model_tau_c = deepcopy(learner)
-        else:
-            self.model_tau_c = control_effect_learner
-
-        if treatment_effect_learner is None:
-            self.model_tau_t = deepcopy(learner)
-        else:
-            self.model_tau_t = treatment_effect_learner
 
+        Note: arguments are stored verbatim (scikit-learn convention) so that
+        ``get_params`` / ``clone`` work correctly. Model construction is deferred to ``fit()``.
+        Per the scikit-learn convention, ``__init__`` does not validate or raise —
+        validation happens in ``fit()``.
+        """
+        # Store verbatim — no deepcopy, no logic (scikit-learn convention).
+        self.learner = learner
+        self.control_outcome_learner = control_outcome_learner
+        self.treatment_outcome_learner = treatment_outcome_learner
+        self.control_effect_learner = control_effect_learner
+        self.treatment_effect_learner = treatment_effect_learner
         self.ate_alpha = ate_alpha
         self.control_name = control_name
-
-        self.propensity = None
-        self.propensity_model = None
-
-    def __repr__(self):
-        return (
-            "{}(control_outcome_learner={},\n"
-            "\ttreatment_outcome_learner={},\n"
-            "\tcontrol_effect_learner={},\n"
-            "\ttreatment_effect_learner={})".format(
-                self.__class__.__name__,
-                self.model_mu_c.__repr__(),
-                self.model_mu_t.__repr__(),
-                self.model_tau_c.__repr__(),
-                self.model_tau_t.__repr__(),
-            )
-        )
+        # Sentinel so estimate_ate(pretrain=True) raises a clean ValueError
+        # ("no propensity score, please call fit() first") instead of AttributeError
+        # when called before fit().
+        self.propensity = {}
 
     def fit(self, X, treatment, y, p=None):
         """Fit the inference model.
@@ -105,6 +73,17 @@ def fit(self, X, treatment, y, p=None):
                 single-treatment case; or, a dictionary of treatment groups that map to propensity vectors of
                 float (0,1); if None will run ElasticNetPropensityModel() to generate the propensity scores.
         """
+        if (self.learner is None) and (
+            (self.control_outcome_learner is None)
+            or (self.treatment_outcome_learner is None)
+            or (self.control_effect_learner is None)
+            or (self.treatment_effect_learner is None)
+        ):
+            raise ValueError(
+                "Either `learner` or all four of `control_outcome_learner`, "
+                "`treatment_outcome_learner`, `control_effect_learner`, and "
+                "`treatment_effect_learner` must be specified."
+            )
         X, treatment, y = convert_pd_to_np(X, treatment, y)
         check_treatment_vector(treatment, self.control_name)
         self.t_groups = np.unique(treatment[treatment != self.control_name])
@@ -117,28 +96,49 @@ def fit(self, X, treatment, y, p=None):
             p = self._format_p(p, self.t_groups)
 
         self._classes = {group: i for i, group in enumerate(self.t_groups)}
-        self.models_mu_t = {group: deepcopy(self.model_mu_t) for group in self.t_groups}
+
+        # Resolve base models from the stored constructor args; fit() trains copies.
+        _control_outcome_learner = (
+            self.control_outcome_learner
+            if self.control_outcome_learner is not None
+            else deepcopy(self.learner)
+        )
+        _treatment_outcome_learner = (
+            self.treatment_outcome_learner
+            if self.treatment_outcome_learner is not None
+            else deepcopy(self.learner)
+        )
+        _control_effect_learner = (
+            self.control_effect_learner
+            if self.control_effect_learner is not None
+            else deepcopy(self.learner)
+        )
+        _treatment_effect_learner = (
+            self.treatment_effect_learner
+            if self.treatment_effect_learner is not None
+            else deepcopy(self.learner)
+        )
+
+        self.models_mu_t = {
+            group: deepcopy(_treatment_outcome_learner) for group in self.t_groups
+        }
         self.models_tau_c = {
-            group: deepcopy(self.model_tau_c) for group in self.t_groups
+            group: deepcopy(_control_effect_learner) for group in self.t_groups
         }
         self.models_tau_t = {
-            group: deepcopy(self.model_tau_t) for group in self.t_groups
+            group: deepcopy(_treatment_effect_learner) for group in self.t_groups
         }
         self.vars_c = {}
         self.vars_t = {}
 
-        # model_mu_c is trained on control data, which is the same for every treatment
-        # group. Deepcopy from the unfitted template so re-calling fit() starts fresh.
+        # model_mu_c is trained on control data, identical for every treatment group.
         control_mask = treatment == self.control_name
-        self.model_mu_c = deepcopy(self._model_mu_c_template)
+        self.model_mu_c = deepcopy(_control_outcome_learner)
         self.model_mu_c.fit(X[control_mask], y[control_mask])
-        # Expose as a shared-reference dict to preserve the public models_mu_c API.
         self.models_mu_c = {group: self.model_mu_c for group in self.t_groups}
 
-        # var_c depends only on model_mu_c and control data — constant across groups.
         y_control_pred = self.model_mu_c.predict(X[control_mask])
         self.var_c = (y[control_mask] - y_control_pred).var()
-        # Keep vars_c dict for backward compatibility with existing callers.
         self.vars_c = {group: self.var_c for group in self.t_groups}
 
         for group in self.t_groups:
@@ -152,11 +152,11 @@ def fit(self, X, treatment, y, p=None):
                 y_treat - self.models_mu_t[group].predict(X_treat)
             ).var()
 
-            # Train treatment effect models using cross-group imputation
             d_c = self.models_mu_t[group].predict(X[control_mask]) - y[control_mask]
             d_t = y_treat - self.model_mu_c.predict(X_treat)
             self.models_tau_c[group].fit(X[control_mask], d_c)
             self.models_tau_t[group].fit(X_treat, d_t)
+        return self
 
     def predict(
         self, X, treatment=None, y=None, p=None, return_components=False, verbose=True
@@ -167,9 +167,7 @@ def predict(
             X (np.matrix or np.array or pd.Dataframe): a feature matrix
             treatment (np.array or pd.Series, optional): a treatment vector
             y (np.array or pd.Series, optional): an outcome vector
-            p (np.ndarray or pd.Series or dict, optional): an array of propensity scores of float (0,1) in the
-                single-treatment case; or, a dictionary of treatment groups that map to propensity vectors of
-                float (0,1); if None will run ElasticNetPropensityModel() to generate the propensity scores.
+            p (np.ndarray or pd.Series or dict, optional): propensity scores
             return_components (bool, optional): whether to return outcome for treatment and control seperately
             verbose (bool, optional): whether to output progress logs
         Returns:
@@ -190,7 +188,6 @@ def predict(
         dhat_cs = {}
         dhat_ts = {}
 
-        # For verbose metrics, control predictions are constant across groups.
         yhat_c_verbose = None
         if (y is not None) and (treatment is not None) and verbose:
             control_mask = treatment == self.control_name
@@ -238,24 +235,20 @@ def fit_predict(
         return_components=False,
         verbose=True,
     ):
-        """Fit the treatment effect and outcome models of the R learner and predict treatment effects.
+        """Fit the X-learner and predict treatment effects.
 
         Args:
             X (np.matrix or np.array or pd.Dataframe): a feature matrix
             treatment (np.array or pd.Series): a treatment vector
             y (np.array or pd.Series): an outcome vector
-            p (np.ndarray or pd.Series or dict, optional): an array of propensity scores of float (0,1) in the
-                single-treatment case; or, a dictionary of treatment groups that map to propensity vectors of
-                float (0,1); if None will run ElasticNetPropensityModel() to generate the propensity scores.
+            p (np.ndarray or pd.Series or dict, optional): propensity scores
             return_ci (bool): whether to return confidence intervals
             n_bootstraps (int): number of bootstrap iterations
             bootstrap_size (int): number of samples per bootstrap
             return_components (bool, optional): whether to return outcome for treatment and control seperately
             verbose (str): whether to output progress logs
         Returns:
-            (numpy.ndarray): Predictions of treatment effects. Output dim: [n_samples, n_treatment]
-                If return_ci, returns CATE [n_samples, n_treatment], LB [n_samples, n_treatment],
-                UB [n_samples, n_treatment]
+            (numpy.ndarray): Predictions of treatment effects.
         """
         X, treatment, y = convert_pd_to_np(X, treatment, y)
         self.fit(X, treatment, y, p)
@@ -292,7 +285,6 @@ def fit_predict(
                 te_bootstraps, (1 - self.ate_alpha / 2) * 100, axis=2
             )
 
-            # set member variables back to global (currently last bootstrapped outcome)
             self.t_groups = t_groups_global
             self._classes = _classes_global
             self.model_mu_c = deepcopy(model_mu_c_global)
@@ -320,9 +312,7 @@ def estimate_ate(
             X (np.matrix or np.array or pd.Dataframe): a feature matrix
             treatment (np.array or pd.Series): a treatment vector
             y (np.array or pd.Series): an outcome vector
-            p (np.ndarray or pd.Series or dict, optional): an array of propensity scores of float (0,1) in the
-                single-treatment case; or, a dictionary of treatment groups that map to propensity vectors of
-                float (0,1); if None will run ElasticNetPropensityModel() to generate the propensity scores.
+            p (np.ndarray or pd.Series or dict, optional): propensity scores
             bootstrap_ci (bool): whether run bootstrap for confidence intervals
             n_bootstraps (int): number of bootstrap iterations
             bootstrap_size (int): number of samples per bootstrap
@@ -332,7 +322,6 @@ def estimate_ate(
         """
         if pretrain:
             if p is None:
-                # when p is null, use pretrain propensity score
                 if not self.propensity:
                     raise ValueError("no propensity score, please call fit() first")
                 te, dhat_cs, dhat_ts = self.predict(
@@ -370,8 +359,6 @@ def estimate_ate(
             dhat_t = dhat_ts[group][mask]
             p_filt = p[group][mask]
 
-            # SE formula is based on the lower bound formula (7) from Imbens, Guido W., and Jeffrey M. Wooldridge. 2009.
-            # "Recent Developments in the Econometrics of Program Evaluation." Journal of Economic Literature
             se = np.sqrt(
                 (
                     self.vars_t[group] / prob_treatment
@@ -412,7 +399,6 @@ def estimate_ate(
                 ate_bootstraps, (1 - self.ate_alpha / 2) * 100, axis=1
             )
 
-            # set member variables back to global (currently last bootstrapped outcome)
             self.t_groups = t_groups_global
             self._classes = _classes_global
             self.model_mu_c = deepcopy(model_mu_c_global)
@@ -438,18 +424,6 @@ def __init__(
         ate_alpha=0.05,
         control_name=0,
     ):
-        """Initialize an X-learner regressor.
-
-        Args:
-            learner (optional): a model to estimate outcomes and treatment effects in both the control and treatment
-                groups
-            control_outcome_learner (optional): a model to estimate outcomes in the control group
-            treatment_outcome_learner (optional): a model to estimate outcomes in the treatment group
-            control_effect_learner (optional): a model to estimate treatment effects in the control group
-            treatment_effect_learner (optional): a model to estimate treatment effects in the treatment group
-            ate_alpha (float, optional): the confidence level alpha of the ATE estimate
-            control_name (str or int, optional): name of control group
-        """
         super().__init__(
             learner=learner,
             control_outcome_learner=control_outcome_learner,
@@ -480,56 +454,44 @@ def __init__(
         """Initialize an X-learner classifier.
 
         Args:
-            outcome_learner (optional): a model to estimate outcomes in both the control and treatment groups.
-                Should be a classifier.
-            effect_learner (optional): a model to estimate treatment effects in both the control and treatment groups.
-                Should be a regressor.
-            control_outcome_learner (optional): a model to estimate outcomes in the control group.
-                Should be a classifier.
-            treatment_outcome_learner (optional): a model to estimate outcomes in the treatment group.
-                Should be a classifier.
-            control_effect_learner (optional): a model to estimate treatment effects in the control group.
-                Should be a regressor.
-            treatment_effect_learner (optional): a model to estimate treatment effects in the treatment group
-                Should be a regressor.
-            ate_alpha (float, optional): the confidence level alpha of the ATE estimate
+            outcome_learner (optional): a classifier for outcomes in both groups.
+            effect_learner (optional): a regressor for treatment effects in both groups.
+            control_outcome_learner (optional): a classifier for control outcomes.
+            treatment_outcome_learner (optional): a classifier for treatment outcomes.
+            control_effect_learner (optional): a regressor for control effects.
+            treatment_effect_learner (optional): a regressor for treatment effects.
+            ate_alpha (float, optional): confidence level alpha of the ATE estimate
             control_name (str or int, optional): name of control group
         """
-        if outcome_learner is not None:
-            control_outcome_learner = outcome_learner
-            treatment_outcome_learner = outcome_learner
-        if effect_learner is not None:
-            control_effect_learner = effect_learner
-            treatment_effect_learner = effect_learner
+        # Store all args verbatim (scikit-learn convention) — no resolution here.
+        self.outcome_learner = outcome_learner
+        self.effect_learner = effect_learner
+        self.control_outcome_learner = control_outcome_learner
+        self.treatment_outcome_learner = treatment_outcome_learner
+        self.control_effect_learner = control_effect_learner
+        self.treatment_effect_learner = treatment_effect_learner
+        self.ate_alpha = ate_alpha
+        self.control_name = control_name
+        # Sentinel so estimate_ate(pretrain=True) raises cleanly before fit().
+        self.propensity = {}
 
-        super().__init__(
-            learner=None,
-            control_outcome_learner=control_outcome_learner,
-            treatment_outcome_learner=treatment_outcome_learner,
-            control_effect_learner=control_effect_learner,
-            treatment_effect_learner=treatment_effect_learner,
-            ate_alpha=ate_alpha,
-            control_name=control_name,
+    def fit(self, X, treatment, y, p=None):
+        """Fit the inference model (classifier variant — uses predict_proba)."""
+        # Resolve in fit(), not __init__. No `or`: bool(unfitted ensemble) can raise.
+        mu_c, mu_t = self.control_outcome_learner, self.treatment_outcome_learner
+        tau_c, tau_t = self.control_effect_learner, self.treatment_effect_learner
+        _control_outcome_learner = self.outcome_learner if mu_c is None else mu_c
+        _treatment_outcome_learner = self.outcome_learner if mu_t is None else mu_t
+        _control_effect_learner = self.effect_learner if tau_c is None else tau_c
+        _treatment_effect_learner = self.effect_learner if tau_t is None else tau_t
-        )
 
         if (
-            (control_outcome_learner is None) or (treatment_outcome_learner is None)
-        ) and ((control_effect_learner is None) or (treatment_effect_learner is None)):
+            _control_outcome_learner is None or _treatment_outcome_learner is None
+        ) or (_control_effect_learner is None or _treatment_effect_learner is None):
             raise ValueError(
-                "Either the outcome learner or the effect learner pair must be specified."
+                "Both the outcome learner and the effect learner pairs must be specified."
             )
 
-    def fit(self, X, treatment, y, p=None):
-        """Fit the inference model.
-
-        Args:
-            X (np.matrix or np.array or pd.Dataframe): a feature matrix
-            treatment (np.array or pd.Series): a treatment vector
-            y (np.array or pd.Series): an outcome vector
-            p (np.ndarray or pd.Series or dict, optional): an array of propensity scores of float (0,1) in the
-                single-treatment case; or, a dictionary of treatment groups that map to propensity vectors of
-                float (0,1); if None will run ElasticNetPropensityModel() to generate the propensity scores.
-        """
         X, treatment, y = convert_pd_to_np(X, treatment, y)
         check_treatment_vector(treatment, self.control_name)
         self.t_groups = np.unique(treatment[treatment != self.control_name])
@@ -542,24 +504,24 @@ def fit(self, X, treatment, y, p=None):
             p = self._format_p(p, self.t_groups)
 
         self._classes = {group: i for i, group in enumerate(self.t_groups)}
-        self.models_mu_t = {group: deepcopy(self.model_mu_t) for group in self.t_groups}
+
+        self.models_mu_t = {
+            group: deepcopy(_treatment_outcome_learner) for group in self.t_groups
+        }
         self.models_tau_c = {
-            group: deepcopy(self.model_tau_c) for group in self.t_groups
+            group: deepcopy(_control_effect_learner) for group in self.t_groups
         }
         self.models_tau_t = {
-            group: deepcopy(self.model_tau_t) for group in self.t_groups
+            group: deepcopy(_treatment_effect_learner) for group in self.t_groups
         }
         self.vars_c = {}
         self.vars_t = {}
 
-        # model_mu_c is trained on control data, which is the same for every treatment
-        # group, so fit it once and store as a single model (not a per-group dict).
         control_mask = treatment == self.control_name
-        self.model_mu_c = deepcopy(self._model_mu_c_template)
+        self.model_mu_c = deepcopy(_control_outcome_learner)
         self.model_mu_c.fit(X[control_mask], y[control_mask])
         self.models_mu_c = {group: self.model_mu_c for group in self.t_groups}
 
-        # var_c depends only on model_mu_c and control data — constant across groups.
         y_control_pred = self.model_mu_c.predict_proba(X[control_mask])[:, 1]
         self.var_c = (y[control_mask] - y_control_pred).var()
         self.vars_c = {group: self.var_c for group in self.t_groups}
@@ -575,7 +537,6 @@ def fit(self, X, treatment, y, p=None):
                 y_treat - self.models_mu_t[group].predict_proba(X_treat)[:, 1]
             ).var()
 
-            # Train treatment effect models using cross-group imputation
             d_c = (
                 self.models_mu_t[group].predict_proba(X[control_mask])[:, 1]
                 - y[control_mask]
@@ -583,25 +544,12 @@ def fit(self, X, treatment, y, p=None):
             d_t = y_treat - self.model_mu_c.predict_proba(X_treat)[:, 1]
             self.models_tau_c[group].fit(X[control_mask], d_c)
             self.models_tau_t[group].fit(X_treat, d_t)
+        return self
 
     def predict(
         self, X, treatment=None, y=None, p=None, return_components=False, verbose=True
     ):
-        """Predict treatment effects.
-
-        Args:
-            X (np.matrix or np.array or pd.Dataframe): a feature matrix
-            treatment (np.array or pd.Series, optional): a treatment vector
-            y (np.array or pd.Series, optional): an outcome vector
-            p (np.ndarray or pd.Series or dict, optional): an array of propensity scores of float (0,1) in the
-                single-treatment case; or, a dictionary of treatment groups that map to propensity vectors of
-                float (0,1); if None will run ElasticNetPropensityModel() to generate the propensity scores.
-            return_components (bool, optional): whether to return outcome for treatment and control seperately
-            return_p_score (bool, optional): whether to return propensity score
-            verbose (bool, optional): whether to output progress logs
-        Returns:
-            (numpy.ndarray): Predictions of treatment effects.
-        """
+        """Predict treatment effects (classifier variant — uses predict_proba)."""
         X, treatment, y = convert_pd_to_np(X, treatment, y)
 
         if p is None:
@@ -617,7 +565,6 @@ def predict(
         dhat_cs = {}
         dhat_ts = {}
 
-        # For verbose metrics, control predictions are constant across groups.
         yhat_c_verbose = None
         if (y is not None) and (treatment is not None) and verbose:
             control_mask = treatment == self.control_name
diff --git a/tests/test_meta_learners.py b/tests/test_meta_learners.py
index 1f87046c..2efb77b0 100644
--- a/tests/test_meta_learners.py
+++ b/tests/test_meta_learners.py
@@ -2,9 +2,11 @@
 import pandas as pd
 import pytest
 
+from sklearn.base import clone
 from sklearn.linear_model import LinearRegression
 from sklearn.linear_model import LogisticRegression
 from sklearn.model_selection import train_test_split
+from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
 from xgboost import XGBRegressor
 from xgboost import XGBClassifier
 from sklearn.ensemble import RandomForestRegressor
@@ -1318,7 +1320,7 @@ def test_multi_treatment_learners():
     Covers: BaseTLearner, BaseXLearner, BaseSLearner, BaseDRLearner, BaseRLearner.
 
     Shared return-type contracts (regression learners below):
-      - ``fit(...)`` → ``None``
+      - ``fit(...)`` → ``self`` (sklearn convention; enables method chaining)
       - ``predict(...)`` → ``np.ndarray`` of shape ``(n_samples, n_treatment_groups)``
       - ``predict(..., return_components=True)`` → ``tuple`` of length 3 ``(te, comp_a, comp_b)``
         (not implemented for R-learner; its ``predict`` only returns CATE).
@@ -1407,8 +1409,9 @@ def _assert_shared_ref_dict(d, single_obj, keys, name, attr):
             d[g] is single_obj for g in keys
         ), f"{name}: all {attr} values must be shared refs to the single fitted model"
 
-    def _assert_fit_returns_none(result, name):
-        assert result is None, f"{name}.fit(): expected None, got {type(result)}"
+    def _assert_fit_returns_self(result, learner, name):
+        """fit() must return self (sklearn convention enabling method chaining)."""
+        assert result is learner, f"{name}.fit(): expected self, got {type(result)}"
 
     def _assert_plain_fit_predict(result, name):
         """fit_predict(return_ci=False) must return a single ndarray (CATE), not a tuple."""
@@ -1419,7 +1422,7 @@ def _assert_plain_fit_predict(result, name):
     # ── T-Learner ─────────────────────────────────────────────────────────────
     name = "BaseTLearner"
     tl = BaseTLearner(learner=LinearRegression())
-    _assert_fit_returns_none(tl.fit(X=X, treatment=treatment, y=y), name)
+    _assert_fit_returns_self(tl.fit(X=X, treatment=treatment, y=y), tl, name)
 
     _assert_fit_attrs(tl, name)
     assert hasattr(tl, "model_c"), f"{name}: missing model_c"
@@ -1467,7 +1470,9 @@ def _assert_plain_fit_predict(result, name):
     # ── X-Learner ─────────────────────────────────────────────────────────────
     name = "BaseXLearner"
     xl = BaseXLearner(learner=LinearRegression())
-    _assert_fit_returns_none(xl.fit(X=X, treatment=treatment, y=y, p=p_scores), name)
+    _assert_fit_returns_self(
+        xl.fit(X=X, treatment=treatment, y=y, p=p_scores), xl, name
+    )
 
     _assert_fit_attrs(xl, name)
     assert hasattr(xl, "model_mu_c"), f"{name}: missing model_mu_c"
@@ -1527,7 +1532,7 @@ def _assert_plain_fit_predict(result, name):
     # ── S-Learner ─────────────────────────────────────────────────────────────
     name = "BaseSLearner"
     sl = BaseSLearner(learner=LinearRegression())
-    _assert_fit_returns_none(sl.fit(X=X, treatment=treatment, y=y), name)
+    _assert_fit_returns_self(sl.fit(X=X, treatment=treatment, y=y), sl, name)
 
     _assert_fit_attrs(sl, name)
     assert hasattr(sl, "models") and isinstance(
@@ -1579,7 +1584,7 @@ def _assert_plain_fit_predict(result, name):
     dr = BaseDRLearner(
         learner=LinearRegression(), treatment_effect_learner=LinearRegression()
     )
-    _assert_fit_returns_none(dr.fit(X=X, treatment=treatment, y=y), name)
+    _assert_fit_returns_self(dr.fit(X=X, treatment=treatment, y=y), dr, name)
 
     _assert_fit_attrs(dr, name)
     # models_mu_c: list of 3 fold models (fold-specific, NOT per-group).
@@ -1635,8 +1640,8 @@ def _assert_plain_fit_predict(result, name):
         effect_learner=LinearRegression(),
         cv_n_jobs=1,
     )
-    _assert_fit_returns_none(
-        rl.fit(X=X, treatment=treatment, y=y, p=p_scores, verbose=False), name
+    _assert_fit_returns_self(
+        rl.fit(X=X, treatment=treatment, y=y, p=p_scores, verbose=False), rl, name
     )
 
     _assert_fit_attrs(rl, name)
@@ -1724,3 +1729,244 @@ def test_BaseTClassifier_predict_return_ci(generate_classification_data):
     # Test 4: old API unchanged
     tau_plain = learner.predict(X)
     assert tau_plain.shape == (X.shape[0], len(learner.t_groups))
+
+
+# =============================================================================
+# sklearn compliance tests — issue #911
+# =============================================================================
+
+# Fixed-seed data (independent of conftest fixtures so tests are self-contained)
+_RNG = np.random.RandomState(42)
+_N = 400
+_X = _RNG.randn(_N, 5)
+_TREATMENT = _RNG.choice([0, 1], _N)
+_Y_CONT = _X[:, 0] + _TREATMENT * 0.5 + _RNG.randn(_N) * 0.1
+_Y_BIN = (_Y_CONT > 0).astype(int)
+_DT_REG = DecisionTreeRegressor(random_state=0, max_depth=3)
+_DT_CLF = DecisionTreeClassifier(random_state=0, max_depth=3)
+
+
+def _sklearn_fit(learner, y=None):
+    """Fit on shared fixtures (binary y for *Classifier); return fit()'s result."""
+    is_classifier = "Classifier" in type(learner).__name__
+    outcome = y if y is not None else (_Y_BIN if is_classifier else _Y_CONT)
+    return learner.fit(_X, _TREATMENT, outcome)
+
+
+def _sklearn_predict(learner, **kw):
+    """Predict on ``_X`` quietly; R-learners take no ``verbose`` so it is omitted."""
+    from causalml.inference.meta.rlearner import BaseRLearner
+
+    accepts_verbose = not isinstance(learner, BaseRLearner)
+    quiet = {"verbose": False} if accepts_verbose else {}
+    return learner.predict(_X, **quiet, **kw)
+
+
+def _params_repr_equal(p1, p2):
+    """Return True if two get_params() dicts share keys and per-key repr() values."""
+    # repr() avoids ambiguous ``==`` on numpy arrays and estimator instances.
+    same_keys = p1.keys() == p2.keys()
+    return same_keys and all(repr(p1[name]) == repr(p2[name]) for name in p1)
+
+
+_REGRESSOR_CONFIGS = [
+    (BaseSRegressor, {"learner": _DT_REG}),
+    (BaseTRegressor, {"learner": _DT_REG}),
+    (BaseXRegressor, {"learner": _DT_REG}),
+    (BaseDRRegressor, {"learner": _DT_REG}),
+    (BaseRRegressor, {"learner": _DT_REG}),
+]
+
+_CLASSIFIER_CONFIGS = [
+    (BaseSClassifier, {"learner": _DT_CLF}),
+    (BaseTClassifier, {"learner": _DT_CLF}),
+    (BaseXClassifier, {"outcome_learner": _DT_CLF, "effect_learner": _DT_REG}),
+]
+
+
+@pytest.mark.parametrize("Cls,kwargs", _REGRESSOR_CONFIGS)
+def test_clone_get_params_regressor(Cls, kwargs):
+    """clone() / get_params() round-trip — all regressors."""
+    m = Cls(**kwargs)
+    params = m.get_params()
+    assert isinstance(params, dict)
+    m2 = clone(m)
+    assert type(m2) is type(m)
+    assert _params_repr_equal(
+        m2.get_params(), params
+    ), f"{Cls.__name__}: clone params mismatch"
+    assert not hasattr(m2, "t_groups"), "clone() must return an unfitted estimator"
+
+
+@pytest.mark.parametrize("Cls,kwargs", _CLASSIFIER_CONFIGS)
+def test_clone_get_params_classifier(Cls, kwargs):
+    """clone() / get_params() round-trip — all classifiers (botched-merge check)."""
+    m = Cls(**kwargs)
+    params = m.get_params()
+    assert isinstance(params, dict)
+    m2 = clone(m)
+    assert type(m2) is type(m)
+    assert _params_repr_equal(m2.get_params(), params)
+    assert not hasattr(m2, "t_groups")
+
+
+@pytest.mark.parametrize("Cls,kwargs", _REGRESSOR_CONFIGS)
+def test_fit_returns_self_regressor(Cls, kwargs):
+    """fit() returns self for all regressors (Pipeline / GridSearchCV requirement)."""
+    m = Cls(**kwargs)
+    assert _sklearn_fit(m) is m, f"{Cls.__name__}.fit() must return self"
+
+
+@pytest.mark.parametrize("Cls,kwargs", _CLASSIFIER_CONFIGS)
+def test_fit_returns_self_classifier(Cls, kwargs):
+    """fit() returns self for all classifiers."""
+    m = Cls(**kwargs)
+    assert _sklearn_fit(m) is m, f"{Cls.__name__}.fit() must return self"
+
+
+def test_xgb_rregressor_clone_no_kwargs():
+    """XGBRRegressor() (no extra kwargs) clones cleanly; xgb_kwargs stored verbatim."""
+    r = XGBRRegressor()
+    params = r.get_params()
+    assert "xgb_kwargs" in params and params["xgb_kwargs"] is None
+    r2 = clone(r)
+    assert r2.xgb_kwargs is None
+    assert _params_repr_equal(r2.get_params(), r.get_params())
+
+
+def test_xgb_rregressor_clone_with_kwargs():
+    """XGBRRegressor(xgb_kwargs={...}) round-trips through clone / get_params."""
+    r = XGBRRegressor(xgb_kwargs={"max_depth": 3, "learning_rate": 0.05})
+    assert r.get_params()["xgb_kwargs"] == {"max_depth": 3, "learning_rate": 0.05}
+    r2 = clone(r)
+    assert r2.xgb_kwargs == {"max_depth": 3, "learning_rate": 0.05}
+    assert _params_repr_equal(r2.get_params(), r.get_params())
+
+
+def test_xgb_rregressor_fit_predict_return_ci():
+    """XGBRRegressor.fit_predict(return_ci=True) exercises clone(self) in bootstrap.
+
+    Before the BaseEstimator refactor, clone(self, safe=False) silently deepcopied
+    a fitted model on every bootstrap iteration.
+    """
+    r = XGBRRegressor(effect_learner_n_estimators=20, random_state=0)
+    te, lo, hi = r.fit_predict(
+        _X, _TREATMENT, _Y_CONT, return_ci=True, n_bootstraps=5, bootstrap_size=200
+    )
+    assert te.shape == lo.shape == hi.shape == (_N, 1)
+    assert np.all(lo <= hi), "CI lower bound must be <= upper bound"
+
+
+@pytest.mark.parametrize(
+    "Cls,kwargs",
+    [
+        (BaseSRegressor, {"learner": _DT_REG}),
+        (BaseTRegressor, {"learner": _DT_REG}),
+        (BaseXRegressor, {"learner": _DT_REG}),
+    ],
+)
+def test_bit_identical_predict(Cls, kwargs):
+    """Two independently fitted clones produce identical predictions (deterministic learners)."""
+    m1 = Cls(**kwargs)
+    _sklearn_fit(m1)
+    te1 = _sklearn_predict(m1)
+    m2 = clone(m1)
+    _sklearn_fit(m2)
+    te2 = _sklearn_predict(m2)
+    np.testing.assert_array_equal(
+        te1,
+        te2,
+        err_msg=f"{Cls.__name__}: predict not bit-identical across clone+refit",
+    )
+
+
+@pytest.mark.parametrize(
+    "Cls,kwargs",
+    [(BaseDRRegressor, {"learner": _DT_REG}), (BaseRRegressor, {"learner": _DT_REG})],
+)
+def test_finite_predict_stochastic(Cls, kwargs):
+    """DR and R learners (internal shuffle-splits) produce finite predictions."""
+    m1 = Cls(**kwargs)
+    _sklearn_fit(m1)
+    te1 = _sklearn_predict(m1)
+    m2 = clone(m1)
+    _sklearn_fit(m2)
+    te2 = _sklearn_predict(m2)
+    assert te1.shape == te2.shape
+    assert np.isfinite(te1).all() and np.isfinite(te2).all()
+
+
+def test_bit_identical_estimate_ate_s():
+    """S-learner estimate_ate() CI triple is bit-identical across clone+refit."""
+    m1 = BaseSRegressor(learner=_DT_REG)
+    ate1, lb1, ub1 = m1.estimate_ate(_X, _TREATMENT, _Y_CONT, return_ci=True)
+    m2 = clone(m1)
+    ate2, lb2, ub2 = m2.estimate_ate(_X, _TREATMENT, _Y_CONT, return_ci=True)
+    np.testing.assert_array_equal(ate1, ate2)
+    np.testing.assert_array_equal(lb1, lb2)
+    np.testing.assert_array_equal(ub1, ub2)
+
+
+def test_t_regressor_return_ci_bit_identical():
+    """TLearner stored bootstrap ensemble is reproducible with fixed random_state."""
+    kw = dict(store_bootstraps=True, n_bootstraps=10, random_state=7)
+    m1 = BaseTRegressor(learner=_DT_REG)
+    m1.fit(_X, _TREATMENT, _Y_CONT, **kw)
+    te1, lo1, hi1 = m1.predict(_X, return_ci=True, verbose=False)
+    m2 = BaseTRegressor(learner=_DT_REG)
+    m2.fit(_X, _TREATMENT, _Y_CONT, **kw)
+    te2, lo2, hi2 = m2.predict(_X, return_ci=True, verbose=False)
+    np.testing.assert_array_equal(te1, te2)
+    np.testing.assert_array_equal(lo1, lo2)
+    np.testing.assert_array_equal(hi1, hi2)
+
+
+def test_t_classifier_clone_fit_predict():
+    """BaseTClassifier clone round-trip and fit/predict smoke test."""
+    m = BaseTClassifier(learner=_DT_CLF)
+    m2 = clone(m)
+    assert _params_repr_equal(m2.get_params(), m.get_params())
+    _sklearn_fit(m2)
+    assert _sklearn_predict(m2).shape == (_N, 1)
+
+
+def test_x_classifier_verbatim_store_clone():
+    """BaseXClassifier stores outcome/effect learner verbatim (not pre-resolved)."""
+    m = BaseXClassifier(outcome_learner=_DT_CLF, effect_learner=_DT_REG)
+    params = m.get_params()
+    assert params["outcome_learner"] is _DT_CLF
+    assert params["control_outcome_learner"] is None  # verbatim, not resolved
+    m2 = clone(m)
+    assert type(m2.outcome_learner) is type(_DT_CLF)
+    assert m2.outcome_learner is not _DT_CLF  # cloned copy
+    assert m2.control_outcome_learner is None
+    _sklearn_fit(m2)
+    assert _sklearn_predict(m2).shape == (_N, 1)
+
+
+def test_t_classifier_fail_fast_return_ci_and_components():
+    """BaseTClassifier.predict raises before any computation when both flags set."""
+    m = BaseTClassifier(learner=_DT_CLF)
+    _sklearn_fit(m)
+    with pytest.raises(ValueError, match="cannot both be True"):
+        m.predict(_X, return_ci=True, return_components=True, verbose=False)
+
+
+def test_x_learner_pretrain_before_fit_raises():
+    """BaseXRegressor.estimate_ate(pretrain=True) before fit() raises ValueError."""
+    with pytest.raises(ValueError, match="no propensity score"):
+        BaseXRegressor(learner=_DT_REG).estimate_ate(
+            _X, _TREATMENT, _Y_CONT, pretrain=True
+        )
+
+
+def test_dr_learner_pretrain_before_fit_raises():
+    """BaseDRRegressor.estimate_ate(pretrain=True) before fit() raises ValueError.
+
+    self.propensity = {} sentinel in __init__ (consistent with BaseXLearner) ensures
+    a clean ValueError rather than an AttributeError on missing t_groups.
+    """
+    with pytest.raises(ValueError):
+        BaseDRRegressor(learner=_DT_REG).estimate_ate(
+            _X, _TREATMENT, _Y_CONT, pretrain=True
+        )