From cec2eb8e377834be3aa5447f37c1364bbd02776a Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Tue, 17 Feb 2026 20:39:28 +0000
Subject: [PATCH 1/2] Initial plan


From ccd7da6b3f53252103b6cf7cb189c84c5ac0c349 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Tue, 17 Feb 2026 20:45:32 +0000
Subject: [PATCH 2/2] Move jsonargparse to main dependencies and implement lazy
 imports for sklearn/scipy

Co-authored-by: aditya0by0 <65857172+aditya0by0@users.noreply.github.com>
---
 chebai/preprocessing/datasets/pubchem.py | 13 ++++++++++---
 pyproject.toml                           |  2 +-
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/chebai/preprocessing/datasets/pubchem.py b/chebai/preprocessing/datasets/pubchem.py
index 8cc208b9..db6e1459 100644
--- a/chebai/preprocessing/datasets/pubchem.py
+++ b/chebai/preprocessing/datasets/pubchem.py
@@ -23,9 +23,6 @@
 import tqdm
 from rdkit import Chem, DataStructs
 from rdkit.Chem import AllChem
-from scipy import spatial
-from sklearn.cluster import KMeans
-from sklearn.model_selection import train_test_split
 
 from chebai.preprocessing import reader as dr
 from chebai.preprocessing.datasets.base import DataLoader, XYBaseDataModule
@@ -150,6 +147,8 @@ def setup_processed(self):
         """
         Prepares processed data and saves them as Torch tensors.
         """
+        from sklearn.model_selection import train_test_split
+
         filename = os.path.join(self.raw_dir, self.raw_file_names[0])
         print("Load data from file", filename)
         data = self._load_data_from_file(filename)
@@ -294,6 +293,8 @@ def setup_processed(self):
         """
         Prepares processed data and saves them as Torch tensors.
         """
+        from sklearn.model_selection import train_test_split
+
         filename = os.path.join(self.raw_dir, self.raw_file_names[0])
         print("Load data from file", filename)
         data_not_tokenized = [entry for entry in self._load_dict(filename)]
@@ -557,6 +558,8 @@ def _build_clusters(self) -> tuple[pd.DataFrame, pd.DataFrame]:
         Returns:
             tuple: Tuple containing cluster centers DataFrame and clustered fingerprints DataFrame.
         """
+        from sklearn.cluster import KMeans
+
         fingerprints_clustered_path = os.path.join(
             self.raw_dir, "fingerprints_clustered.pkl"
         )
@@ -603,6 +606,8 @@ def _exclude_clusters(self, cluster_centers: pd.DataFrame) -> pd.DataFrame:
         Returns:
             pd.DataFrame: DataFrame of filtered cluster centers.
         """
+        from scipy import spatial
+
         exclusion_data_path = os.path.join(self.raw_dir, "exclusion_data_clustered.pkl")
         cluster_centers_np = np.array(
             [
@@ -701,6 +706,8 @@ def cluster_centers_superclustered(self) -> pd.DataFrame:
         Returns:
             pd.DataFrame: DataFrame of superclustered cluster centers.
         """
+        from sklearn.cluster import KMeans
+
         cluster_centers_path = os.path.join(
             self.raw_dir, "cluster_centers_superclustered.pkl"
         )
diff --git a/pyproject.toml b/pyproject.toml
index b3652b00..a4953bd0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -23,6 +23,7 @@ dependencies = [
     "pysmiles==1.1.2",
     "rdkit==2024.3.6",
     "lightning==2.5.1",
+    "jsonargparse[signatures]>=4.17",
 ]
 
 [project.optional-dependencies]
@@ -36,7 +37,6 @@ dev = [
     "scipy",
     "fastobo",
     "selfies",
-    "jsonargparse[signatures]>=4.17",
     "omegaconf",
     "deepsmiles",
     "iterative-stratification",