
Commit 65b0f11

avoid repetition of SMILES-to-mol conversion
1 parent 8ec94bb commit 65b0f11

File tree

2 files changed: +25 additions, -9 deletions


chebai/preprocessing/reader.py

Lines changed: 10 additions & 9 deletions
@@ -199,24 +199,25 @@ def _read_data(self, raw_data: str) -> List[int]:
         Returns:
             List[int]: A list of integers representing the indices of the SMILES tokens.
         """
-        if self.canonicalize_smiles:
-            try:
-                mol = Chem.MolFromSmiles(raw_data.strip())
-                if mol is not None:
-                    raw_data = Chem.MolToSmiles(mol, canonical=True)
-            except Exception as e:
-                print(f"RDKit failed to canonicalize the SMILES: {raw_data}")
-                print(f"\t{e}")
         try:
             mol = Chem.MolFromSmiles(raw_data.strip())
             if mol is None:
                 raise ValueError(f"Invalid SMILES: {raw_data}")
-            return [self._get_token_index(v[1]) for v in _tokenize(raw_data)]
         except ValueError as e:
             print(f"could not process {raw_data}")
             print(f"\tError: {e}")
             return None
 
+        if self.canonicalize_smiles:
+            try:
+                raw_data = Chem.MolToSmiles(mol, canonical=True)
+            except Exception as e:
+                print(f"RDKit failed to canonicalize the SMILES: {raw_data}")
+                print(f"\t{e}")
+                return None
+
+        return [self._get_token_index(v[1]) for v in _tokenize(raw_data)]
+
     def _back_to_smiles(self, smiles_encoded):
         token_file = self.reader.token_path
         token_coding = {}
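
For illustration, a minimal, self-contained sketch of the control flow after this change: Chem.MolFromSmiles runs once, the parsed mol is reused for the optional canonicalization, and tokenization happens last. The free-function name and the character-level stand-in tokenizer below are hypothetical, not the reader's own _tokenize / _get_token_index.

from typing import List, Optional

from rdkit import Chem


def read_smiles(raw_data: str, canonicalize_smiles: bool = True) -> Optional[List[str]]:
    # Single SMILES -> mol conversion; the mol is reused below instead of parsing twice.
    try:
        mol = Chem.MolFromSmiles(raw_data.strip())
        if mol is None:
            raise ValueError(f"Invalid SMILES: {raw_data}")
    except ValueError as e:
        print(f"could not process {raw_data}")
        print(f"\tError: {e}")
        return None

    if canonicalize_smiles:
        try:
            # Reuse the mol parsed above; no second MolFromSmiles call.
            raw_data = Chem.MolToSmiles(mol, canonical=True)
        except Exception as e:
            print(f"RDKit failed to canonicalize the SMILES: {raw_data}")
            print(f"\t{e}")
            return None

    # Character-level stand-in for the reader's _tokenize / _get_token_index lookup.
    return list(raw_data)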

tests/unit/readers/testChemDataReader.py

Lines changed: 15 additions & 0 deletions
@@ -111,6 +111,21 @@ def test_read_data_with_invalid_input(self) -> None:
                 f"The output for invalid token '{raw_data}' should be None.",
             )
 
+    def test_read_data_with_invalid_input_with_no_canonicalize(self) -> None:
+        """
+        Test the _read_data method with an invalid input.
+        The invalid token should prompt a return value None
+        """
+        self.reader.canonicalize_smiles = False
+        raw_datas = ["%INVALID%", "ADADAD", "ADASDAD", "CC(=O)NC1[Mg-2]"]
+        for raw_data in raw_datas:
+            result = self.reader._read_data(raw_data)
+            self.assertIsNone(
+                result,
+                f"The output for invalid token '{raw_data}' should be None.",
+            )
+        self.reader.canonicalize_smiles = True  # Reset to original state
+
     @patch("builtins.open", new_callable=mock_open)
     def test_finish_method_for_new_tokens(self, mock_file: mock_open) -> None:
         """
