Skip to content

Commit 5343e3e

Browse files
author
dmoi
committed
dataset compilation
1 parent 137c78b commit 5343e3e

File tree

3 files changed

+3867
-1150
lines changed

3 files changed

+3867
-1150
lines changed

foldtree2/encode_pdbs.py

Lines changed: 50 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -1,34 +1,72 @@
1-
from foldtree2.src.converter import pdbgraph
1+
from foldtree2.src import pdbgraph
22
import os
3+
import glob
34
import torch
5+
import numpy as np
46
import argparse
57

68
# Command-line arguments: an input directory with PDBs, a glob pattern for input PDBs, an output .h5 file, and an optional FoldX directory
79
if __name__ == '__main__':
8-
converter = pdbgraph.PDB2PyG()import glob
9-
#set device to gpu if available
10+
# Setting the seed for reproducibility
11+
torch.manual_seed(0)
12+
np.random.seed(0)
13+
torch.backends.cudnn.deterministic = True
14+
torch.backends.cudnn.benchmark = False
15+
16+
# Initialize converter with config
17+
converter = pdbgraph.PDB2PyG(
18+
aapropcsv='foldtree2/config/aaindex1.csv'
19+
)
20+
21+
# Set device to gpu if available
1022
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
23+
1124
parser = argparse.ArgumentParser(description='Encode pdbs')
1225
parser.add_argument('input_dir', type=str, help='Input directory with pdbs')
13-
parser.add_argument('input_glob', type=str, help='Input directory with pdbs')
26+
parser.add_argument('input_glob', type=str, help='Glob pattern for input pdbs')
1427
parser.add_argument('output_h5', type=str, help='Output file with pytorch geometric graphs of pdbs')
15-
parser.add_argument( 'foldxdir' , type = str , help = 'foldx directory with foldx output for all pdbs')
28+
parser.add_argument('foldxdir', type=str, nargs='?', default=None, help='foldx directory with foldx output for all pdbs')
29+
parser.add_argument('--distance', type=float, default=15, help='Distance threshold for contact map (default: 15)')
30+
parser.add_argument('--add-prody', action='store_true', default=True, help='Add ProDy features (default: True)')
31+
parser.add_argument('--verbose', action='store_true', default=False, help='Verbose output')
32+
parser.add_argument('--multiprocessing', action='store_true', default=False, help='Use multiprocessing for parallel processing')
33+
parser.add_argument('--ncpu', type=int, default=25, help='Number of CPUs for multiprocessing (default: 25)')
1634

17-
#add help for the arguments
35+
# Add help for the arguments
1836
parser.description = "Encode PDB files into PyTorch geometric graphs with optional FoldX data integration."
1937
parser.epilog = ("Example usage:\n"
2038
" python encode_pdbs.py /path/to/pdbs '*.pdb' output.h5 /path/to/foldx")
2139

2240
args = parser.parse_args()
2341

2442
if args.input_glob:
25-
files = glob.glob( args.input_glob )
43+
files = glob.glob(args.input_glob)
2644
else:
2745
files = glob.glob(os.path.join(args.input_dir, '*.pdb'))
46+
47+
# Shuffle the data for randomization
48+
np.random.shuffle(files)
49+
2850
output_h5 = args.output_h5
29-
if args.foldxdir:
30-
foldx = args.foldxdir
51+
foldx = args.foldxdir
52+
53+
# Create h5 dataset with pytorch geometric graphs
54+
# Using the same parameters as in the notebook
55+
if args.multiprocessing:
56+
converter.store_pyg_mp(
57+
files,
58+
filename=output_h5,
59+
foldxdir=foldx,
60+
verbose=args.verbose,
61+
add_prody=args.add_prody,
62+
ncpu=args.ncpu
63+
)
3164
else:
32-
foldx = None
33-
#create h5 dataset with pytorch geometric graphs
34-
converter.store_pyg(pdbfiles, filename= output_h5, foldxdir = foldx , verbose = False)
65+
converter.store_pyg(
66+
files,
67+
filename=output_h5,
68+
foldxdir=foldx,
69+
verbose=args.verbose,
70+
add_prody=args.add_prody,
71+
distance=args.distance
72+
)

foldtree2/notebooks/experiments/test_monodecoders.ipynb

Lines changed: 243 additions & 893 deletions
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)