"""Encode PDB files into PyTorch Geometric graphs stored in an HDF5 file.

Command line arguments are an input directory with pdbs, a glob pattern for
the input pdbs, an output h5 file and an optional FoldX output directory.
"""
import argparse
import glob
import os
import warnings
from typing import List, Optional

import numpy as np
import torch

# Amino-acid property table handed to ``PDB2PyG`` as ``aapropcsv``.
AAPROP_CSV = 'foldtree2/config/aaindex1.csv'
# Default value of ``--distance``; also used to detect an explicit override.
DEFAULT_DISTANCE = 15
# Seed applied to torch and numpy before the file list is shuffled.
SEED = 0


def build_parser() -> argparse.ArgumentParser:
    """Return the command-line parser for the encoding script."""
    parser = argparse.ArgumentParser(
        description="Encode PDB files into PyTorch geometric graphs with optional FoldX data integration.",
        epilog=("Example usage:\n "
                " python encode_pdbs.py /path/to/pdbs '*.pdb' output.h5 /path/to/foldx"),
        # Keep the line break inside the epilog instead of letting argparse reflow it.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument('input_dir', type=str, help='Input directory with pdbs')
    parser.add_argument('input_glob', type=str, help='Glob pattern for input pdbs')
    parser.add_argument('output_h5', type=str, help='Output file with pytorch geometric graphs of pdbs')
    parser.add_argument('foldxdir', type=str, nargs='?', default=None,
                        help='foldx directory with foldx output for all pdbs')
    parser.add_argument('--distance', type=float, default=DEFAULT_DISTANCE,
                        help='Distance threshold for contact map (default: 15)')
    # ``store_true`` with ``default=True`` can never yield False on its own,
    # so the flag is paired with an explicit negative form sharing the same dest.
    parser.add_argument('--add-prody', dest='add_prody', action='store_true', default=True,
                        help='Add ProDy features (default: True)')
    parser.add_argument('--no-add-prody', dest='add_prody', action='store_false',
                        help='Do not add ProDy features')
    parser.add_argument('--verbose', action='store_true', default=False, help='Verbose output')
    parser.add_argument('--multiprocessing', action='store_true', default=False,
                        help='Use multiprocessing for parallel processing')
    parser.add_argument('--ncpu', type=int, default=25,
                        help='Number of CPUs for multiprocessing (default: 25)')
    return parser


def collect_files(input_dir: str, input_glob: str) -> List[str]:
    """Return the sorted list of input files selected by the arguments.

    Args:
        input_dir: Directory searched when ``input_glob`` is empty, and used
            as a fallback base for a relative ``input_glob``.
        input_glob: Glob pattern. It is first expanded exactly as given; if
            that matches nothing and the pattern is relative, it is expanded
            again relative to ``input_dir``.

    Returns:
        Matching paths in sorted order (possibly empty).
    """
    if not input_glob:
        matches = glob.glob(os.path.join(input_dir, '*.pdb'))
    else:
        matches = glob.glob(input_glob)
        if not matches and not os.path.isabs(input_glob):
            # Makes the documented usage (directory + bare '*.pdb') work
            # without changing any invocation that already matched files.
            matches = glob.glob(os.path.join(input_dir, input_glob))
    # glob order depends on the file system; sort so that the seeded shuffle
    # performed later yields the same order on every machine.
    return sorted(matches)


def seed_everything(seed: int = SEED) -> None:
    """Seed torch and numpy and switch cuDNN to deterministic mode."""
    torch.manual_seed(seed)
    np.random.seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


def main(argv: Optional[List[str]] = None) -> None:
    """Parse arguments, gather the pdb files and write the graph dataset.

    Args:
        argv: Argument list to parse; ``None`` makes argparse read ``sys.argv``.
    """
    parser = build_parser()
    args = parser.parse_args(argv)

    files = collect_files(args.input_dir, args.input_glob)
    if not files:
        parser.error(
            'no input files matched %r (also tried relative to %r)'
            % (args.input_glob, args.input_dir)
        )

    if args.multiprocessing and args.distance != DEFAULT_DISTANCE:
        # Only the single-process call below receives ``distance``.
        # NOTE(review): confirm whether store_pyg_mp should accept it as well.
        warnings.warn('--distance is ignored when --multiprocessing is set '
                      '(it is only passed to store_pyg)')

    # Setting the seed for reproducibility
    seed_everything(SEED)

    # Imported here so that argument handling does not depend on the
    # converter module being importable.
    from foldtree2.src import pdbgraph

    # Initialize converter with config
    converter = pdbgraph.PDB2PyG(aapropcsv=AAPROP_CSV)

    # Shuffle the data for randomization
    np.random.shuffle(files)

    # Create h5 dataset with pytorch geometric graphs
    if args.multiprocessing:
        converter.store_pyg_mp(
            files,
            filename=args.output_h5,
            foldxdir=args.foldxdir,
            verbose=args.verbose,
            add_prody=args.add_prody,
            ncpu=args.ncpu,
        )
    else:
        converter.store_pyg(
            files,
            filename=args.output_h5,
            foldxdir=args.foldxdir,
            verbose=args.verbose,
            add_prody=args.add_prody,
            distance=args.distance,
        )


if __name__ == '__main__':
    main()