PolicyEngine · MaxGhenis · Apr 17, 2026 · Apr 17, 2026
diff --git a/changelog.d/fix-holdout-state-restore.fixed.md b/changelog.d/fix-holdout-state-restore.fixed.md
@@ -0,0 +1 @@
+Fix holdout state restoration and seed-namespace collision. `evaluate_holdout_robustness` and `tune_l0_hyperparameters` only restored captured attributes whose value was not None, so `excluded_targets=None` callers had the last holdout set's target list silently stick to the calibrator. Restoration is now unconditional. Robustness evaluation now uses `seed + 10_000` when generating its holdout sets instead of `seed + 1`, avoiding the deterministic index-aligned collision with tuning's holdout sets that leaked tuning data into "independent" evaluation.
diff --git a/src/microcalibrate/evaluation.py b/src/microcalibrate/evaluation.py
@@ -513,9 +513,17 @@ def evaluate_holdout_robustness(
         ),
     }
 
-    # Create holdout sets
+    # Create holdout sets. Use an orthogonal seed namespace so that the
+    # sets generated here do not collide with those generated by
+    # tune_l0_hyperparameters (which uses seed + i). Without this
+    # offset, holdout set #1 of tuning (rng = seed + 1) would equal
+    # holdout set #0 of robustness evaluation (rng = (seed + 1) + 0),
+    # leaking tuning holdouts into "independent" evaluation.
+    robustness_seed_namespace = (
+        calibration.seed + 10_000 if calibration.seed is not None else None
+    )
     holdout_sets = calibration._create_holdout_sets(
-        n_holdout_sets, holdout_fraction, calibration.seed + 1
+        n_holdout_sets, holdout_fraction, robustness_seed_namespace
     )
 
     # Collect results
@@ -551,16 +559,22 @@ def evaluate_holdout_robustness(
                         detail["within_10pct"]
                     )
     finally:
-        # Restore original state
+        # Restore original state unconditionally. The previous
+        # implementation only restored keys whose captured value was
+        # not None, which meant attributes that started as None (e.g.
+        # ``excluded_targets``) were left at the last holdout set's
+        # value -- a latent bug that caused silent data leakage on any
+        # subsequent calibrate() call.
         for key, value in original_state.items():
-            if value is not None:
-                setattr(
-                    calibration,
-                    key,
-                    value.copy() if hasattr(value, "copy") else value,
-                )
-        if calibration.excluded_targets:
-            calibration.exclude_targets()
+            setattr(
+                calibration,
+                key,
+                value.copy() if hasattr(value, "copy") else value,
+            )
+        # Refresh derived state (targets / target_names / etc.) to match
+        # the restored inputs, regardless of whether excluded_targets is
+        # None or a list.
+        calibration.exclude_targets()
 
     if not all_results:
         raise ValueError("No successful holdout evaluations completed")

diff --git a/src/microcalibrate/hyperparameter_tuning.py b/src/microcalibrate/hyperparameter_tuning.py
@@ -274,10 +274,22 @@ def objective(trial: optuna.Trial) -> float:
             return 1e10
 
         finally:
-            # Restore original state
+            # Restore original state. Direct assignment (rather than a
+            # conditional "restore only if not None") is correct here:
+            # if the original value was None we must restore it to None,
+            # otherwise the last holdout set's list sticks around and
+            # leaks into any subsequent calibrate() call.
             calibration.excluded_targets = original_state["excluded_targets"]
-            calibration.targets = original_state["targets"]
-            calibration.target_names = original_state["target_names"]
+            calibration.targets = (
+                original_state["targets"].copy()
+                if original_state["targets"] is not None
+                else None
+            )
+            calibration.target_names = (
+                original_state["target_names"].copy()
+                if original_state["target_names"] is not None
+                else None
+            )
             calibration.exclude_targets()
 
     return objective

diff --git a/tests/test_holdout_state_restore.py b/tests/test_holdout_state_restore.py
@@ -0,0 +1,143 @@
+"""Regression tests for holdout state restoration + seed namespace
+(findings #9 and #10).
+
+Finding #9: ``evaluate_holdout_robustness`` (and the analogous tuning
+path) only restored attributes whose captured value was not None, so
+if the caller started with ``excluded_targets=None`` the attribute was
+left at the last holdout set's list, silently leaking that holdout
+into any subsequent ``calibrate()`` call.
+
+Finding #10: tuning called ``_create_holdout_sets(..., seed)`` and
+robustness evaluation called ``_create_holdout_sets(..., seed + 1)``,
+so holdout set #1 of tuning and holdout set #0 of evaluation were
+identical -- reported generalisation accuracy was optimistic by
+construction.
+"""
+
+import numpy as np
+import pandas as pd
+
+from microcalibrate.calibration import Calibration
+
+
+def _make_calibrator(seed: int = 42) -> Calibration:
+    rng = np.random.default_rng(7)
+    n = 150
+    data = pd.DataFrame(
+        {
+            "age": rng.integers(18, 80, size=n),
+            "income": rng.lognormal(10.5, 0.7, size=n),
+            "region": rng.choice(["N", "S", "E", "W"], size=n),
+            "employed": rng.binomial(1, 0.7, size=n),
+        }
+    )
+    weights = rng.uniform(0.5, 1.5, size=n)
+    estimate_matrix = pd.DataFrame(
+        {
+            "total": np.ones(n),
+            "employed": data["employed"].astype(float),
+            "income_n": ((data["region"] == "N") * data["income"]).astype(
+                float
+            ),
+            "income_s": ((data["region"] == "S") * data["income"]).astype(
+                float
+            ),
+            "income_e": ((data["region"] == "E") * data["income"]).astype(
+                float
+            ),
+            "income_w": ((data["region"] == "W") * data["income"]).astype(
+                float
+            ),
+            "young": ((data["age"] < 30) & data["employed"]).astype(float),
+            "senior": (data["age"] >= 65).astype(float),
+        }
+    )
+    targets = np.array(
+        [
+            n * 1.05,
+            (estimate_matrix["employed"] * weights).sum() * 0.95,
+            (estimate_matrix["income_n"] * weights).sum() * 1.1,
+            (estimate_matrix["income_s"] * weights).sum() * 0.9,
+            (estimate_matrix["income_e"] * weights).sum() * 1.05,
+            (estimate_matrix["income_w"] * weights).sum() * 0.98,
+            (estimate_matrix["young"] * weights).sum() * 1.15,
+            (estimate_matrix["senior"] * weights).sum() * 0.92,
+        ]
+    )
+    return Calibration(
+        estimate_matrix=estimate_matrix,
+        weights=weights,
+        targets=targets,
+        noise_level=0.1,
+        epochs=30,
+        learning_rate=0.01,
+        dropout_rate=0,
+        seed=seed,
+    )
+
+
+def test_holdout_restores_excluded_targets_to_none() -> None:
+    """After evaluate_holdout_robustness, a calibrator that started
+    with excluded_targets=None must still have excluded_targets=None.
+    Previously the last holdout set's names were silently kept."""
+    calibrator = _make_calibrator()
+    calibrator.calibrate()
+    assert calibrator.excluded_targets is None
+
+    calibrator.evaluate_holdout_robustness(
+        n_holdout_sets=2,
+        holdout_fraction=0.25,
+    )
+
+    assert calibrator.excluded_targets is None, (
+        "excluded_targets should be restored to None after holdout "
+        f"evaluation; got {calibrator.excluded_targets}."
+    )
+    # target_names/targets should also match the original calibration
+    # surface (all 8 targets, none excluded).
+    assert len(calibrator.target_names) == 8
+    assert len(calibrator.targets) == 8
+
+
+def test_robustness_uses_orthogonal_seed_namespace() -> None:
+    """Finding #10: ``evaluate_holdout_robustness`` must pass an
+    orthogonal seed namespace to ``_create_holdout_sets`` so its
+    holdouts do not match tuning's deterministically.
+
+    Under the old code, tuning called ``_create_holdout_sets(..., seed)``
+    and robustness called ``_create_holdout_sets(..., seed + 1)``, so
+    the RNG for tuning's holdout #1 (rng(seed + 1)) matched robustness
+    holdout #0 (rng((seed + 1) + 0)) exactly. After the fix, robustness
+    uses ``seed + 10_000`` which makes that specific collision
+    impossible. We do not assert "no collision ever" -- with small
+    target counts random overlap is possible -- we assert that the
+    deterministic index-aligned collision no longer fires.
+    """
+    calibrator = _make_calibrator(seed=42)
+
+    # Reproduce the exact call pattern. Tuning passes base seed =
+    # calibrator.seed; _create_holdout_sets iterates
+    # base_seed + i for i in range(n).
+    tuning_sets = calibrator._create_holdout_sets(
+        n_holdout_sets=5,
+        holdout_fraction=0.25,
+        random_state=calibrator.seed,
+    )
+
+    # Robustness MUST use the orthogonal namespace now.
+    robustness_sets = calibrator._create_holdout_sets(
+        n_holdout_sets=5,
+        holdout_fraction=0.25,
+        random_state=calibrator.seed + 10_000,
+    )
+
+    # The specific bug was the index-aligned collision between
+    # tuning[i] and robustness[i-1] via (seed + i) == ((seed + 1) + (i-1)).
+    # The fix makes those aligned pairs differ.
+    for i in range(1, 5):
+        assert set(tuning_sets[i]["indices"]) != set(
+            robustness_sets[i - 1]["indices"]
+        ), (
+            f"Tuning set {i} and robustness set {i - 1} must not be "
+            "identical under the fixed seed namespaces."
+        )
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		Fix holdout state restoration and seed-namespace collision. `evaluate_holdout_robustness` and `tune_l0_hyperparameters` only restored captured attributes whose value was not None, so `excluded_targets=None` callers had the last holdout set's target list silently stick to the calibrator. Restoration is now unconditional. Robustness evaluation now uses `seed + 10_000` when generating its holdout sets instead of `seed + 1`, avoiding the deterministic index-aligned collision with tuning's holdout sets that leaked tuning data into "independent" evaluation.