@@ -67,6 +67,14 @@ def read_molecule(
6767 for line in pdbqt_data :
6868 pdb_block += f"{ line [:66 ]} \n "
6969 mol = Chem .MolFromPDBBlock (pdb_block , sanitize = False , removeHs = False )
70+ elif molecule_file .endswith ("_lig.pdb" ):
71+ with open (molecule_file ) as file :
72+ pdb_data = file .readlines ()
73+ pdb_block = ""
74+ for line in pdb_data :
75+ if line .startswith ("HETATM" ):
76+ pdb_block += line
77+ mol = Chem .MolFromPDBBlock (pdb_block , sanitize = False , removeHs = False )
7078 elif molecule_file .endswith (".pdb" ):
7179 mol = Chem .MolFromPDBFile (molecule_file , sanitize = False , removeHs = False )
7280 else :
@@ -99,16 +107,24 @@ def read_mols(
99107 dataset_dir : str ,
100108 name : str ,
101109 remove_hs : bool = False ,
110+ dataset : Optional [str ] = None ,
102111) -> List [Chem .Mol ]:
103112 """Load RDKit `Mol` objects corresponding to a dataset's ligand name."""
104113 ligs = []
105- for file in os .listdir (os .path .join (dataset_dir , name )):
106- if file .endswith ("_ligand.sdf" ) and "rdkit" not in file :
107- lig = read_molecule (
108- os .path .join (dataset_dir , name , file ), remove_hs = remove_hs , sanitize = True
109- )
110- if lig is not None :
111- ligs .append (lig )
114+ if dataset is not None and dataset == "casp15" :
115+ lig = read_molecule (
116+ os .path .join (dataset_dir , f"{ name } _lig.pdb" ), remove_hs = remove_hs , sanitize = True
117+ )
118+ if lig is not None :
119+ ligs .append (lig )
120+ else :
121+ for file in os .listdir (os .path .join (dataset_dir , name )):
122+ if file .endswith ("_ligand.sdf" ) and "rdkit" not in file :
123+ lig = read_molecule (
124+ os .path .join (dataset_dir , name , file ), remove_hs = remove_hs , sanitize = True
125+ )
126+ if lig is not None :
127+ ligs .append (lig )
112128 return ligs
113129
114130
@@ -293,6 +309,7 @@ def get_alignment_rotation(
293309 pdb_id : str ,
294310 dataset_protein_path : str ,
295311 predicted_protein_path : str ,
312+ dataset : str ,
296313 dataset_path : str ,
297314) -> Tuple [Optional [Rotation ], Optional [np .ndarray ], Optional [np .ndarray ]]:
298315 """Calculate the alignment rotation between apo and holo protein structures and their ligand
@@ -304,10 +321,6 @@ def get_alignment_rotation(
304321 structure predictor.
305322 :param dataset: Name of the dataset.
306323 :param dataset_path: Filepath to the PDB file containing ligand coordinates.
307- :param lig_connection_radius: Radius for connecting ligand atoms.
308- :param exclude_af2aa_excluded_ligs: Whether to exclude ligands excluded from the AF2-AA
309- dataset.
310- :param skip_parsed_ligands: Whether to skip parsing ligands if they have already been parsed.
311324 :return: A tuple containing rotation matrix (Optional[Rotation]), centroid of Ca atoms for a
312325 dataset protein (Optional[np.ndarray]), and centroid of Ca atoms for a prediction
313326 (Optional[np.ndarray]).
@@ -326,7 +339,7 @@ def get_alignment_rotation(
326339 f"Unable to parse predicted protein structure for PDB ID { pdb_id } due to the error: { e } . Skipping..."
327340 )
328341 return None , None , None
329- dataset_ligand = read_mols (dataset_path , pdb_id , remove_hs = True )[0 ]
342+ dataset_ligand = read_mols (dataset_path , pdb_id , remove_hs = True , dataset = dataset )[0 ]
330343
331344 try :
332345 dataset_calpha_coords = extract_receptor_structure (
@@ -391,10 +404,24 @@ def align_apo_structure_to_holo_structure(
391404 :param filename: Filename of the predicted apo structure.
392405 :param atom_df_name: Name of the atom DataFrame derived from the corresponding PDB file input.
393406 """
394- pdb_id = "_" .join (Path (filename ).stem .split ("_" )[:2 ])
407+ pdb_id = Path (filename ).stem
408+ data_dir = cfg .data_dir
409+
410+ processed_protein_suffix = "_protein"
411+ if cfg .dataset == "dockgen" :
412+ processed_protein_suffix = "_protein_processed"
413+ elif cfg .dataset == "casp15" :
414+ processed_protein_suffix = "_lig"
415+ data_dir = os .path .join (data_dir , "targets" )
416+
417+ processed_protein_name = f"{ pdb_id } { processed_protein_suffix } .pdb"
418+ processed_protein_filename = (
419+ os .path .join (data_dir , processed_protein_name )
420+ if cfg .dataset == "casp15"
421+ else os .path .join (data_dir , pdb_id , processed_protein_name )
422+ )
423+
395424 predicted_protein_filename = os .path .join (cfg .predicted_structures_dir , f"{ pdb_id } .pdb" )
396- processed_protein_name = f"{ pdb_id } _protein.pdb"
397- processed_protein_filename = os .path .join (cfg .data_dir , pdb_id , processed_protein_name )
398425 predicted_protein_output_filename = os .path .join (
399426 cfg .output_dir , f"{ pdb_id } _holo_aligned_predicted_protein.pdb"
400427 )
@@ -403,7 +430,8 @@ def align_apo_structure_to_holo_structure(
403430 pdb_id = pdb_id ,
404431 dataset_protein_path = processed_protein_filename ,
405432 predicted_protein_path = predicted_protein_filename ,
406- dataset_path = cfg .data_dir ,
433+ dataset = cfg .dataset ,
434+ dataset_path = data_dir ,
407435 )
408436
409437 if any (
@@ -437,7 +465,7 @@ def main(cfg: DictConfig):
437465
438466 :param cfg: Hydra config for the alignments.
439467 """
440- if cfg .dataset not in ["posebusters_benchmark" , "astex_diverse" ]:
468+ if cfg .dataset not in ["posebusters_benchmark" , "astex_diverse" , "dockgen" , "casp15" ]:
441469 raise ValueError (f"Dataset { cfg .dataset } is not supported." )
442470 output_dir = cfg .output_dir
443471 os .makedirs (output_dir , exist_ok = True )
0 commit comments