himjl · himjl · Mar 26, 2026 · Mar 25, 2026 · Mar 25, 2026 · Mar 25, 2026
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -5,6 +5,7 @@ on:
     paths:
       # Changes to source code
       - hobj/**
+      - test/**.py
       # Changes to workflows
       - .github/workflows/ci.yml
       # Changes to project/dependency metadata

diff --git a/examples/documentation.ipynb b/examples/documentation.ipynb
diff --git a/examples/score_example_models.ipynb b/examples/score_example_models.ipynb
diff --git a/examples/view_experiment1_behavior.ipynb b/examples/view_experiment1_behavior.ipynb
diff --git a/examples/view_experiment2_behavior.ipynb b/examples/view_experiment2_behavior.ipynb
diff --git a/hobj/__init__.py b/hobj/__init__.py
@@ -1,2 +1,28 @@
-import hobj.benchmarks as benchmarks
-import hobj.learning_models as learning_models
+from hobj.benchmarks.mut_oneshot_benchmark import MutatorOneshotBenchmark
+from hobj.benchmarks.mut_highvar_benchmark import MutatorHighVarBenchmark
+
+from hobj.data_loaders.behavior import load_highvar_behavior, load_oneshot_behavior
+
+from hobj.data_loaders.images import (
+    load_image,
+    load_imageset_meta_highvar,
+    load_imageset_meta_oneshot,
+    load_imageset_meta_warmup,
+    load_imageset_meta_catch,
+)
+
+__all__ = [
+    # Raw behavior loaders
+    "load_highvar_behavior",
+    "load_oneshot_behavior",
+    # Image meta loaders:
+    "load_imageset_meta_highvar",
+    "load_imageset_meta_oneshot",
+    "load_imageset_meta_warmup",
+    "load_imageset_meta_catch",
+    # Image loader:
+    "load_image",
+    # Benchmarks:
+    "MutatorHighVarBenchmark",
+    "MutatorOneshotBenchmark",
+]
diff --git a/hobj/benchmarks/__init__.py b/hobj/benchmarks/__init__.py
@@ -1,2 +0,0 @@
-from hobj.benchmarks.mut_highvar_benchmark import MutatorHighVarBenchmark
-from hobj.benchmarks.mut_oneshot_benchmark import MutatorOneshotBenchmark

diff --git a/hobj/benchmarks/binary_classification/benchmark.py b/hobj/benchmarks/binary_classification/benchmark.py
@@ -7,71 +7,84 @@
 from tqdm import tqdm
 
 from hobj.benchmarks.binary_classification.estimator import LearningCurveStatistics
-from hobj.benchmarks.binary_classification.simulation import BinaryClassificationSubtask, BinaryClassificationSubtaskResult
+from hobj.benchmarks.binary_classification.simulation import (
+    BinaryClassificationSubtask,
+    BinaryClassificationSubtaskResult,
+)
 from hobj.learning_models import BinaryLearningModel
 from hobj.stats.ci import estimate_basic_bootstrap_CI
 
 
 # %% Models for configuring a LearningCurveBenchmark:
 class TargetSubtaskData(pydantic.BaseModel):
     subtask: BinaryClassificationSubtask  # The subtask which generated the associated results
-    results: List[BinaryClassificationSubtaskResult]  # [session, trial] boolean matrix of performance
+    results: List[
+        BinaryClassificationSubtaskResult
+    ]  # [session, trial] boolean matrix of performance
 
-    model_config = dict(
-        arbitrary_types_allowed=True
-    )
+    model_config = dict(arbitrary_types_allowed=True)
 
-    @pydantic.model_validator(mode='after')
-    def validate_results(self) -> 'TargetSubtaskData':
+    @pydantic.model_validator(mode="after")
+    def validate_results(self) -> "TargetSubtaskData":
         # Check shape
         for result in self.results:
             if self.subtask.ntrials != len(result.perf_seq):
-                raise ValueError(f"Expected {self.subtask.ntrials} trials, but got {result.perf_seq} trials")
+                raise ValueError(
+                    f"Expected {self.subtask.ntrials} trials, but got {len(result.perf_seq)} trials"
+                )
 
         return self
 
 
 class LearningCurveBenchmarkConfig(pydantic.BaseModel):
-    subtask_name_to_data: Dict[str, 'TargetSubtaskData'] = pydantic.Field(default_factory=dict, description="A dictionary of subtask_name -> TargetSubtaskConfig")
+    subtask_name_to_data: Dict[str, "TargetSubtaskData"] = pydantic.Field(
+        default_factory=dict,
+        description="A dictionary of subtask_name -> TargetSubtaskConfig",
+    )
     num_simulations_per_subtask: int = pydantic.Field(ge=2)
     num_bootstrap_samples: int = pydantic.Field(ge=2)
     bootstrap_by_worker: bool
     ntrials: Optional[int] = pydantic.Field(default=None)
 
-    @pydantic.model_validator(mode='after')
-    def ensure_rectangular(self) -> 'LearningCurveBenchmarkConfig':
-
+    @pydantic.model_validator(mode="after")
+    def ensure_rectangular(self) -> "LearningCurveBenchmarkConfig":
         ntrials_observed = set()
         for name, data in self.subtask_name_to_data.items():
             ntrials_observed.add(data.subtask.ntrials)
 
         if not len(ntrials_observed) == 1:
-            raise ValueError(f"Expected all subtasks to have the same number of trials, but got {ntrials_observed}")
+            raise ValueError(
+                f"Expected all subtasks to have the same number of trials, but got {ntrials_observed}"
+            )
 
-        if self.ntrials is not None:
-            if self.ntrials != ntrials_observed.pop():
-                raise ValueError(f"Expected ntrials to be {ntrials_observed.pop()}, but got {self.ntrials}")
-        else:
-            self.ntrials = ntrials_observed.pop()
+        (expected_ntrials,) = ntrials_observed  # exactly one element (checked above); read it once instead of pop()-ing twice
+        if self.ntrials is None:
+            self.ntrials = expected_ntrials
+        elif self.ntrials != expected_ntrials:
+            raise ValueError(
+                f"Expected ntrials to be {expected_ntrials}, but got {self.ntrials}"
+            )
         return self
 
 
 # %%
 class LearningCurveBenchmark:
-
     def __init__(
-            self,
-            config: LearningCurveBenchmarkConfig,
+        self,
+        config: LearningCurveBenchmarkConfig,
     ):
         self.config = config
 
         # Attach properties
         self.subtask_names = sorted(config.subtask_name_to_data.keys())
         self.subtask_name_to_subtask: Dict[str, BinaryClassificationSubtask] = {
-            name: config.subtask_name_to_data[name].subtask for name in self.subtask_names
+            name: config.subtask_name_to_data[name].subtask
+            for name in self.subtask_names
         }
 
-        self.subtask_name_to_results: Dict[str, List[BinaryClassificationSubtaskResult]] = {}
+        self.subtask_name_to_results: Dict[
+            str, List[BinaryClassificationSubtaskResult]
+        ] = {}
         self._target_data = {}
 
         for name in self.subtask_names:
@@ -82,8 +95,12 @@ def __init__(
             for result in results:
                 worker_id = result.worker_id
                 if worker_id in self._target_data[name]:
-                    raise ValueError(f"Worker {worker_id} has already been seen for subtask {name}")
-                self._target_data[name][result.worker_id] = list([bool(v) for v in result.perf_seq])
+                    raise ValueError(
+                        f"Worker {worker_id} has already been seen for subtask {name}"
+                    )
+                self._target_data[name][result.worker_id] = list(
+                    [bool(v) for v in result.perf_seq]
+                )
 
         self._target_statistics = LearningCurveStatistics(
             subtask_name_to_results=self.subtask_name_to_results,
@@ -95,7 +112,6 @@ def __init__(
     def target_data(self) -> Dict[str, Dict[str, List[bool]]]:
         return self._target_data
 
-
     @property
     def target_statistics(self) -> LearningCurveStatistics:
         """
@@ -115,18 +131,20 @@ class LearningCurveBenchmarkResult:
         model_statistics: LearningCurveStatistics
 
     def __call__(
-            self,
-            learner: BinaryLearningModel,
-            show_pbar: bool = False
+        self, learner: BinaryLearningModel, show_pbar: bool = False
     ) -> LearningCurveBenchmarkResult:
         """
         :param learner: LearningModel
         :return:
         """
 
         # Get model learning curve statistics:
-        subtask_name_to_model_results: Dict[str, List[BinaryClassificationSubtaskResult]] = {}
-        for i_subtask, subtask_name in enumerate(tqdm(self.subtask_names, desc='Subtask simulations:', disable=not show_pbar)):
+        subtask_name_to_model_results: Dict[
+            str, List[BinaryClassificationSubtaskResult]
+        ] = {}
+        for i_subtask, subtask_name in enumerate(
+            tqdm(self.subtask_names, desc="Subtask simulations:", disable=not show_pbar)
+        ):
             # Get [simulation, trial] boolean performance matrix for the model
             subtask_results = self.simulate_model_behavior(
                 subtask=self.subtask_name_to_subtask[subtask_name],
@@ -149,7 +167,7 @@ def __call__(
             model_varhat_phat=model_statistics.varhat_phat,
             target_phat=self.target_statistics.phat,
             target_varhat_phat=self.target_statistics.varhat_phat,
-            condition_dims=('subtask', 'trial'),
+            condition_dims=("subtask", "trial"),
             fit_lapse_rate=True,
         )
 
@@ -159,7 +177,7 @@ def __call__(
             model_varhat_phat=model_statistics.boot_varhat_phat,
             target_phat=self.target_statistics.boot_phat,
             target_varhat_phat=self.target_statistics.boot_varhat_phat,
-            condition_dims=('subtask', 'trial'),
+            condition_dims=("subtask", "trial"),
             fit_lapse_rate=True,
         )
 
@@ -182,9 +200,9 @@ def __call__(
 
     @staticmethod
     def simulate_model_behavior(
-            subtask: BinaryClassificationSubtask,
-            learner: BinaryLearningModel,
-            nsimulations: int,
+        subtask: BinaryClassificationSubtask,
+        learner: BinaryLearningModel,
+        nsimulations: int,
     ) -> List[BinaryClassificationSubtaskResult]:
         """
         Returns a [nsimulations, ntrials] matrix of model performance on the subtask.
@@ -213,34 +231,33 @@ def simulate_model_behavior(
 
     @classmethod
     def _compare_learning_curves(
-            cls,
-            model_phat: xr.DataArray,
-            model_varhat_phat: xr.DataArray,
-            target_phat: xr.DataArray,
-            target_varhat_phat: xr.DataArray,
-            condition_dims: Tuple[str, ...],
-            fit_lapse_rate: bool
+        cls,
+        model_phat: xr.DataArray,
+        model_varhat_phat: xr.DataArray,
+        target_phat: xr.DataArray,
+        target_varhat_phat: xr.DataArray,
+        condition_dims: Tuple[str, ...],
+        fit_lapse_rate: bool,
     ) -> Tuple[Union[np.ndarray, np.generic], Union[xr.DataArray, None]]:
-
         if fit_lapse_rate:
             lapse_rate = cls._fit_lapse_rate(
-                pmodel=model_phat,
-                ptarget=target_phat,
-                condition_dims=condition_dims
+                pmodel=model_phat, ptarget=target_phat, condition_dims=condition_dims
             )
             model_phat = model_phat * (1 - lapse_rate) + 0.5 * lapse_rate
             model_varhat_phat = model_varhat_phat * (1 - lapse_rate) ** 2
         else:
             lapse_rate = None
 
-        msen = np.square(model_phat - target_phat).mean(condition_dims) - model_varhat_phat.mean(condition_dims) - target_varhat_phat.mean(condition_dims)
+        msen = (
+            np.square(model_phat - target_phat).mean(condition_dims)
+            - model_varhat_phat.mean(condition_dims)
+            - target_varhat_phat.mean(condition_dims)
+        )
         return msen, lapse_rate
 
     @staticmethod
     def _fit_lapse_rate(
-            pmodel: xr.DataArray,
-            ptarget: xr.DataArray,
-            condition_dims: Tuple[str, ...]
+        pmodel: xr.DataArray, ptarget: xr.DataArray, condition_dims: Tuple[str, ...]
     ) -> Union[np.ndarray, np.generic]:
         """
         Fits a "lapse rate" parameter (gamma), which takes on values between [0, 1]. It may be interpreted
@@ -257,8 +274,15 @@ def _fit_lapse_rate(
         """
 
         nway = 2
-        numerator = -(2 * pmodel / nway - 2 * np.square(pmodel) + 2 * pmodel * ptarget - 2 * ptarget / nway).sum(dim=condition_dims)
-        denominator = (2 / (nway ** 2) - 4 * pmodel / nway + 2 * (pmodel ** 2)).sum(dim=condition_dims)
+        numerator = -(
+            2 * pmodel / nway
+            - 2 * np.square(pmodel)
+            + 2 * pmodel * ptarget
+            - 2 * ptarget / nway
+        ).sum(dim=condition_dims)
+        denominator = (2 / (nway**2) - 4 * pmodel / nway + 2 * (pmodel**2)).sum(
+            dim=condition_dims
+        )
         gamma_star = numerator / denominator
         gamma_star = np.clip(gamma_star, 0, 1)
         return gamma_star
Original file line number	Diff line number	Diff line change
		@@ -1,2 +0,0 @@
		from hobj.benchmarks.mut_highvar_benchmark import MutatorHighVarBenchmark
		from hobj.benchmarks.mut_oneshot_benchmark import MutatorOneshotBenchmark