Skip to content

Commit 2e3394d

Browse files
committed
Add CASP15 support for apo-to-holo alignment
1 parent 943d8e0 commit 2e3394d

File tree

1 file changed

+45
-17
lines changed

1 file changed

+45
-17
lines changed

posebench/data/components/protein_apo_to_holo_alignment.py

Lines changed: 45 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,14 @@ def read_molecule(
6767
for line in pdbqt_data:
6868
pdb_block += f"{line[:66]}\n"
6969
mol = Chem.MolFromPDBBlock(pdb_block, sanitize=False, removeHs=False)
70+
elif molecule_file.endswith("_lig.pdb"):
71+
with open(molecule_file) as file:
72+
pdb_data = file.readlines()
73+
pdb_block = ""
74+
for line in pdb_data:
75+
if line.startswith("HETATM"):
76+
pdb_block += line
77+
mol = Chem.MolFromPDBBlock(pdb_block, sanitize=False, removeHs=False)
7078
elif molecule_file.endswith(".pdb"):
7179
mol = Chem.MolFromPDBFile(molecule_file, sanitize=False, removeHs=False)
7280
else:
@@ -99,16 +107,24 @@ def read_mols(
99107
dataset_dir: str,
100108
name: str,
101109
remove_hs: bool = False,
110+
dataset: Optional[str] = None,
102111
) -> List[Chem.Mol]:
103112
"""Load RDKit `Mol` objects corresponding to a dataset's ligand name."""
104113
ligs = []
105-
for file in os.listdir(os.path.join(dataset_dir, name)):
106-
if file.endswith("_ligand.sdf") and "rdkit" not in file:
107-
lig = read_molecule(
108-
os.path.join(dataset_dir, name, file), remove_hs=remove_hs, sanitize=True
109-
)
110-
if lig is not None:
111-
ligs.append(lig)
114+
if dataset is not None and dataset == "casp15":
115+
lig = read_molecule(
116+
os.path.join(dataset_dir, f"{name}_lig.pdb"), remove_hs=remove_hs, sanitize=True
117+
)
118+
if lig is not None:
119+
ligs.append(lig)
120+
else:
121+
for file in os.listdir(os.path.join(dataset_dir, name)):
122+
if file.endswith("_ligand.sdf") and "rdkit" not in file:
123+
lig = read_molecule(
124+
os.path.join(dataset_dir, name, file), remove_hs=remove_hs, sanitize=True
125+
)
126+
if lig is not None:
127+
ligs.append(lig)
112128
return ligs
113129

114130

@@ -293,6 +309,7 @@ def get_alignment_rotation(
293309
pdb_id: str,
294310
dataset_protein_path: str,
295311
predicted_protein_path: str,
312+
dataset: str,
296313
dataset_path: str,
297314
) -> Tuple[Optional[Rotation], Optional[np.ndarray], Optional[np.ndarray]]:
298315
"""Calculate the alignment rotation between apo and holo protein structures and their ligand
@@ -304,10 +321,6 @@ def get_alignment_rotation(
304321
structure predictor.
305322
:param dataset: Name of the dataset.
306323
:param dataset_path: Filepath to the PDB file containing ligand coordinates.
307-
:param lig_connection_radius: Radius for connecting ligand atoms.
308-
:param exclude_af2aa_excluded_ligs: Whether to exclude ligands excluded from the AF2-AA
309-
dataset.
310-
:param skip_parsed_ligands: Whether to skip parsing ligands if they have already been parsed.
311324
:return: A tuple containing rotation matrix (Optional[Rotation]), centroid of Ca atoms for a
312325
dataset protein (Optional[np.ndarray]), and centroid of Ca atoms for a prediction
313326
(Optional[np.ndarray]).
@@ -326,7 +339,7 @@ def get_alignment_rotation(
326339
f"Unable to parse predicted protein structure for PDB ID {pdb_id} due to the error: {e}. Skipping..."
327340
)
328341
return None, None, None
329-
dataset_ligand = read_mols(dataset_path, pdb_id, remove_hs=True)[0]
342+
dataset_ligand = read_mols(dataset_path, pdb_id, remove_hs=True, dataset=dataset)[0]
330343

331344
try:
332345
dataset_calpha_coords = extract_receptor_structure(
@@ -391,10 +404,24 @@ def align_apo_structure_to_holo_structure(
391404
:param filename: Filename of the predicted apo structure.
392405
:param atom_df_name: Name of the atom DataFrame derived from the corresponding PDB file input.
393406
"""
394-
pdb_id = "_".join(Path(filename).stem.split("_")[:2])
407+
pdb_id = Path(filename).stem
408+
data_dir = cfg.data_dir
409+
410+
processed_protein_suffix = "_protein"
411+
if cfg.dataset == "dockgen":
412+
processed_protein_suffix = "_protein_processed"
413+
elif cfg.dataset == "casp15":
414+
processed_protein_suffix = "_lig"
415+
data_dir = os.path.join(data_dir, "targets")
416+
417+
processed_protein_name = f"{pdb_id}{processed_protein_suffix}.pdb"
418+
processed_protein_filename = (
419+
os.path.join(data_dir, processed_protein_name)
420+
if cfg.dataset == "casp15"
421+
else os.path.join(data_dir, pdb_id, processed_protein_name)
422+
)
423+
395424
predicted_protein_filename = os.path.join(cfg.predicted_structures_dir, f"{pdb_id}.pdb")
396-
processed_protein_name = f"{pdb_id}_protein.pdb"
397-
processed_protein_filename = os.path.join(cfg.data_dir, pdb_id, processed_protein_name)
398425
predicted_protein_output_filename = os.path.join(
399426
cfg.output_dir, f"{pdb_id}_holo_aligned_predicted_protein.pdb"
400427
)
@@ -403,7 +430,8 @@ def align_apo_structure_to_holo_structure(
403430
pdb_id=pdb_id,
404431
dataset_protein_path=processed_protein_filename,
405432
predicted_protein_path=predicted_protein_filename,
406-
dataset_path=cfg.data_dir,
433+
dataset=cfg.dataset,
434+
dataset_path=data_dir,
407435
)
408436

409437
if any(
@@ -437,7 +465,7 @@ def main(cfg: DictConfig):
437465
438466
:param cfg: Hydra config for the alignments.
439467
"""
440-
if cfg.dataset not in ["posebusters_benchmark", "astex_diverse"]:
468+
if cfg.dataset not in ["posebusters_benchmark", "astex_diverse", "dockgen", "casp15"]:
441469
raise ValueError(f"Dataset {cfg.dataset} is not supported.")
442470
output_dir = cfg.output_dir
443471
os.makedirs(output_dir, exist_ok=True)

0 commit comments

Comments
 (0)