
Commit 65b0f11

avoid repetition of SMILES-to-mol conversion
1 parent 8ec94bb commit 65b0f11

File tree

2 files changed: +25 additions, -9 deletions


chebai/preprocessing/reader.py

Lines changed: 10 additions & 9 deletions
@@ -199,24 +199,25 @@ def _read_data(self, raw_data: str) -> List[int]:
         Returns:
             List[int]: A list of integers representing the indices of the SMILES tokens.
         """
-        if self.canonicalize_smiles:
-            try:
-                mol = Chem.MolFromSmiles(raw_data.strip())
-                if mol is not None:
-                    raw_data = Chem.MolToSmiles(mol, canonical=True)
-            except Exception as e:
-                print(f"RDKit failed to canonicalize the SMILES: {raw_data}")
-                print(f"\t{e}")
         try:
             mol = Chem.MolFromSmiles(raw_data.strip())
             if mol is None:
                 raise ValueError(f"Invalid SMILES: {raw_data}")
-            return [self._get_token_index(v[1]) for v in _tokenize(raw_data)]
         except ValueError as e:
             print(f"could not process {raw_data}")
             print(f"\tError: {e}")
             return None
 
+        if self.canonicalize_smiles:
+            try:
+                raw_data = Chem.MolToSmiles(mol, canonical=True)
+            except Exception as e:
+                print(f"RDKit failed to canonicalize the SMILES: {raw_data}")
+                print(f"\t{e}")
+                return None
+
+        return [self._get_token_index(v[1]) for v in _tokenize(raw_data)]
+
     def _back_to_smiles(self, smiles_encoded):
         token_file = self.reader.token_path
         token_coding = {}
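
For illustration, a minimal, self-contained sketch of the control flow after this change: Chem.MolFromSmiles runs once, the parsed mol is reused for the optional canonicalization, and tokenization happens last. The free-function name and the character-level stand-in tokenizer below are hypothetical, not the reader's own _tokenize / _get_token_index.

from typing import List, Optional

from rdkit import Chem


def read_smiles(raw_data: str, canonicalize_smiles: bool = True) -> Optional[List[str]]:
    # Single SMILES -> mol conversion; the mol is reused below instead of parsing twice.
    try:
        mol = Chem.MolFromSmiles(raw_data.strip())
        if mol is None:
            raise ValueError(f"Invalid SMILES: {raw_data}")
    except ValueError as e:
        print(f"could not process {raw_data}")
        print(f"\tError: {e}")
        return None

    if canonicalize_smiles:
        try:
            # Reuse the mol parsed above; no second MolFromSmiles call.
            raw_data = Chem.MolToSmiles(mol, canonical=True)
        except Exception as e:
            print(f"RDKit failed to canonicalize the SMILES: {raw_data}")
            print(f"\t{e}")
            return None

    # Character-level stand-in for the reader's _tokenize / _get_token_index lookup.
    return list(raw_data)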

tests/unit/readers/testChemDataReader.py

Lines changed: 15 additions & 0 deletions
@@ -111,6 +111,21 @@ def test_read_data_with_invalid_input(self) -> None:
                 f"The output for invalid token '{raw_data}' should be None.",
             )
 
+    def test_read_data_with_invalid_input_with_no_canonicalize(self) -> None:
+        """
+        Test the _read_data method with an invalid input.
+        The invalid token should prompt a return value None
+        """
+        self.reader.canonicalize_smiles = False
+        raw_datas = ["%INVALID%", "ADADAD", "ADASDAD", "CC(=O)NC1[Mg-2]"]
+        for raw_data in raw_datas:
+            result = self.reader._read_data(raw_data)
+            self.assertIsNone(
+                result,
+                f"The output for invalid token '{raw_data}' should be None.",
+            )
+        self.reader.canonicalize_smiles = True  # Reset to original state
+
     @patch("builtins.open", new_callable=mock_open)
     def test_finish_method_for_new_tokens(self, mock_file: mock_open) -> None:
         """
