From 8bb828762968a0b3400da43faec7f7321fbe09d1 Mon Sep 17 00:00:00 2001 From: Coos Baakman Date: Mon, 9 Jan 2017 13:19:15 +0100 Subject: [PATCH 1/6] fixes for annotater: use ftp library and delete tmp files --- annotate.py | 74 +++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 60 insertions(+), 14 deletions(-) diff --git a/annotate.py b/annotate.py index d655ca1..a8bc8cf 100755 --- a/annotate.py +++ b/annotate.py @@ -1,10 +1,12 @@ #!/usr/bin/python import sys,os,commands +from ftplib import FTP # Must import storage before utils import update_settings as settings from storage import storage + storage.uri = settings.MONGODB_URI storage.db_name = settings.MONGODB_DB_NAME storage.connect() @@ -27,6 +29,7 @@ # , # etc. + # Returns a list of triples: (comment, databank name, pdbid) def parse_comments (lines): @@ -73,6 +76,7 @@ def parse_comment(lines, entry): return '' + def update_entry (entry): databank_name = entry ['databank_name'] @@ -84,6 +88,7 @@ def update_entry (entry): else: storage.insert ('entries', entry) + # This function gets all comment information from a whynot # file and updates the corresponding entries with it. def annotate_from_file (path): @@ -116,8 +121,7 @@ def annotate_from_file (path): # else just check all other sources of information... - -# Check the files in the whynot comments directory: +print 'Check the files in the whynot comments directory' whynotdir = os.path.dirname (sys.argv [0]) commentsdir = os.path.join (whynotdir, 'comment') @@ -143,14 +147,16 @@ def annotate_from_file (path): # A pdb entry can contain only carbohydrates or only nucleic acids, in # which case no DSSP can be made. 
-pdbidscarbonly = Set () -pdbidsnuconly = Set () -pdbidsnmr = Set () -pdbidsem = Set () -pdbidsother = Set () -pdbidsdiff = Set () +pdbidscarbonly = Set() +pdbidsnuconly = Set() +pdbidsnmr = Set() +pdbidsem = Set() +pdbidsother = Set() +pdbidsdiff = Set() +pdbidssf = Set() +pdbidsnmrr = Set() -# Parse wwpdb entry type record +print 'Parse wwpdb entry type record' for line in read_http('ftp://ftp.wwpdb.org/pub/pdb/derived_data/pdb_entry_type.txt').split('\n'): if len(line.strip()) <= 0: continue @@ -171,8 +177,26 @@ def annotate_from_file (path): elif method=='other': pdbidsother.add(pdbid) -# Generate comments for missing structure factors. -# Do this wherever the experimental method is not diffraction: + +print 'Listing deposited structure factor files' +ftp = FTP('ftp.wwpdb.org') +ftp.login() +ftp.cwd('/pub/pdb/data/structures/divided/structure_factors/') +for part in ftp.nlst(): + for filename in ftp.nlst(part): + pdbid = filename[1: 5] + pdbidssf.add(pdbid) + + +print 'Listing deposited nmr restraints files' +ftp.cwd('/pub/pdb/data/structures/divided/nmr_restraints/') +for part in ftp.nlst(): + for filename in ftp.nlst(part): + pdbid = filename[0: 4] + pdbidsnmrr.add(pdbid) + + +print 'Generate comments for missing structure factors' for entry in get_unannotated_entries('STRUCTUREFACTORS'): pdbid = entry['pdbid'] @@ -191,12 +215,16 @@ def annotate_from_file (path): entry['comment'] = 'Not a Diffraction experiment' entry['mtime'] = time() + elif pdbid not in pdbidssf: + + entry['comment'] = 'Not deposited' + entry['mtime'] = time() + if 'comment' in entry: update_entry (entry) -# Generate comments for missing nmr data. 
-# Do this wherever the experimental method is not nmr: +print 'Generate comments for missing nmr data' for entry in get_unannotated_entries('NMR'): pdbid = entry['pdbid'] @@ -215,9 +243,16 @@ def annotate_from_file (path): entry['comment'] = 'Not an NMR experiment' entry['mtime'] = time() + elif pdbid not in pdbidsnmrr: + + entry['comment'] = 'Not deposited' + entry['mtime'] = time() + if 'comment' in entry: update_entry (entry) + +print 'Generate comments for missing hssp files' # To find out why HSSP entries are missing, one must check the error output of # mkhssp when it ran. It's been stored in a reserved directory: for entry in get_unannotated_entries('HSSP'): @@ -243,6 +278,8 @@ def annotate_from_file (path): entry ['mtime'] = time() update_entry (entry) + +print 'Generate comments for missing dssp files' # DSSP files can be missing for multiple reasons: # 1 the structure has no protein, carbohydrates/nucleic acids only # 2 the structure hase no backbone, only alpha carbon atoms @@ -277,7 +314,10 @@ def annotate_from_file (path): continue # Run dsspcmbi and catch stderr: - lines = commands.getoutput('%s %s /tmp/%s.dssp 2>&1 >/dev/null' % (mkdssp, inputfile, pdbid)).split('\n') + dsspfile = '/tmp/%s.dssp' % pdbid + lines = commands.getoutput('%s %s %s 2>&1 >/dev/null' % (mkdssp, inputfile, dsspfile)).split('\n') + if os.path.isfile(dsspfile): + os.remove(dsspfile) if lines [-1].strip () == 'empty protein, or no valid complete residues': entry['comment'] = 'No residues with complete backbone' # for backwards compatibility entry['mtime'] = time() @@ -285,6 +325,8 @@ def annotate_from_file (path): if 'comment' in entry: update_entry (entry) + +print 'Generate comments for missing bdb files' # BDB comments are simply stored in a file, generated by the bdb script. 
for entry in get_missing_entries('BDB'): @@ -301,6 +343,8 @@ def annotate_from_file (path): entry ['mtime'] = time() update_entry (entry) + +print 'Generate comments for whatif lists' # WHATIF list comments are simply stored in a file, generated by the script. for lis in ['acc', 'cal', 'cc1', 'cc2', 'cc3', 'chi', 'dsp', 'iod', 'sbh', 'sbr', 'ss1', 'ss2', 'tau', 'wat']: for src in ['pdb', 'redo']: @@ -320,6 +364,8 @@ def annotate_from_file (path): entry['mtime'] = time() update_entry (entry) + +print 'Generate comments for scenes' # WHATIF scene comments are simply stored in a file, generated by the script. for lis in ['iod', 'ss2']: for src in ['pdb', 'redo']: From 4b82815a00823a362d8c8a74bfb9ec12011ea94c Mon Sep 17 00:00:00 2001 From: Coos Baakman Date: Fri, 30 Jun 2017 12:33:55 +0200 Subject: [PATCH 2/6] add the new whynot file location --- annotate.py | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/annotate.py b/annotate.py index a8bc8cf..c6b3cc5 100755 --- a/annotate.py +++ b/annotate.py @@ -326,6 +326,22 @@ def annotate_from_file (path): update_entry (entry) +print 'Generate comments for missing pdbredo entries' +for entry in get_missing_entries('PDB_REDO'): + + pdbid = entry['pdbid'] + whynotfile = '/srv/data/pdb_redo/whynot/%s.txt' % pdbid + if not os.path.isfile(whynotfile): + continue + + lines = open(whynotfile, 'r').readlines() + comment = parse_comment(lines, entry) + if len(comment) > 0: + entry['comment'] = comment + entry['mtime'] = time() + update_entry(entry) + + print 'Generate comments for missing bdb files' # BDB comments are simply stored in a file, generated by the bdb script. 
for entry in get_missing_entries('BDB'): @@ -339,9 +355,9 @@ def annotate_from_file (path): lines = open(whynotfile, 'r').readlines() comment = parse_comment(lines, entry) if len(comment) > 0: - entry ['comment'] = comment - entry ['mtime'] = time() - update_entry (entry) + entry['comment'] = comment + entry['mtime'] = time() + update_entry(entry) print 'Generate comments for whatif lists' From aa0306b57954ad0eda9a88c3b0040d26ca7ebe63 Mon Sep 17 00:00:00 2001 From: Coos Baakman Date: Tue, 4 Jul 2017 12:17:27 +0200 Subject: [PATCH 3/6] fix to make comment parser accept PDB-REDO as PDB_REDO --- annotate.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/annotate.py b/annotate.py index c6b3cc5..40c2732 100755 --- a/annotate.py +++ b/annotate.py @@ -10,7 +10,7 @@ storage.uri = settings.MONGODB_URI storage.db_name = settings.MONGODB_DB_NAME storage.connect() -storage.authenticate ('whynotadmin', 'waivuy8N') +#storage.authenticate ('whynotadmin', 'waivuy8N') from utils import entries_by_pdbid, get_unannotated_entries, get_missing_entries, read_http @@ -31,7 +31,7 @@ # Returns a list of triples: (comment, databank name, pdbid) -def parse_comments (lines): +def parse_comments(lines): if len(lines) < 2: return {} @@ -46,6 +46,7 @@ def parse_comments (lines): elif ',' in line: databank_name, pdbid = line.strip ().replace (' ','').split (',') + databank_name = databank_name.replace('-', '_') d.append ((comment, databank_name, pdbid)) elif len (line.strip ()) > 0: @@ -70,7 +71,7 @@ def parse_comment(lines, entry): for line in lines[1:]: - line = line.replace (' ','').strip () + line = line.replace (' ','').replace('-', '_').strip () if line == '%s,%s' % (entry ['databank_name'], entry ['pdbid']): return comment From aeacaa838183bb5ac5032f5c555c97d515d65b16 Mon Sep 17 00:00:00 2001 From: Coos Baakman Date: Thu, 6 Jul 2017 11:28:49 +0200 Subject: [PATCH 4/6] take mongo data from chelonium --- whynot_web/default_settings.py | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/whynot_web/default_settings.py b/whynot_web/default_settings.py index 9d5bf3d..b2647b7 100644 --- a/whynot_web/default_settings.py +++ b/whynot_web/default_settings.py @@ -1,3 +1,3 @@ # mongo -MONGODB_URI = "mongodb://whynot_mongo_1" +MONGODB_URI = "mongodb://chelonium.cmbi.umcn.nl:27017" MONGODB_DB_NAME = "whynot" From 5e8e9e8c362b2aebf6b40d76c752b8ffc97b4876 Mon Sep 17 00:00:00 2001 From: Coos Baakman Date: Thu, 16 Nov 2017 14:16:11 +0100 Subject: [PATCH 5/6] make pdbfinder depend on hssp --- install.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/install.py b/install.py index 3983388..e861d23 100755 --- a/install.py +++ b/install.py @@ -52,7 +52,7 @@ def create_databanks(): re.compile(r'.*/([\w]{4})\.hssp.bz2'),FILE,'DSSP')) docs.append(_create_databank('PDBFINDER','http://swift.cmbi.ru.nl/gv/pdbfinder/', 'ftp://ftp.cmbi.ru.nl/pub/molbio/data/pdbfinder/PDBFIND.TXT.gz', - re.compile(r'ID : ([\w]{4})'),LINE,'PDB')) + re.compile(r'ID : ([\w]{4})'),LINE,'HSSP')) docs.append(_create_databank('PDBFINDER2','http://swift.cmbi.ru.nl/gv/pdbfinder/', 'ftp://ftp.cmbi.ru.nl/pub/molbio/data/pdbfinder2/PDBFIND2.TXT.gz', re.compile(r'ID : ([\w]{4})'),LINE,'PDBFINDER')) From cda88a38a5abaaebf851f4558e4395bc6abf0322 Mon Sep 17 00:00:00 2001 From: Coos Baakman Date: Thu, 19 Jul 2018 11:51:30 +0200 Subject: [PATCH 6/6] replaced cmbi.ru.nl by cmbi.umcn.nl --- install.py | 48 ++++++++++++++++++++++++------------------------ 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/install.py b/install.py index e861d23..00b5412 100755 --- a/install.py +++ b/install.py @@ -41,20 +41,20 @@ def create_databanks(): docs.append(_create_databank('PDB','http://www.wwpdb.org/', 'ftp://ftp.wwpdb.org/pub/pdb/data/structures/divided/pdb/${PART}/pdb${PDBID}.ent.gz', re.compile(r'.*/pdb([\w]{4})\.ent(\.gz)?'),FILE,'MMCIF')) - docs.append(_create_databank('BDB','http://www.cmbi.ru.nl/bdb/', - 
'ftp://ftp.cmbi.ru.nl/pub/molbio/data/bdb/${PART}/${PDBID}/${PDBID}.bdb', + docs.append(_create_databank('BDB','http://www.cmbi.umcn.nl/bdb/', + 'ftp://ftp.cmbi.umcn.nl/pub/molbio/data/bdb/${PART}/${PDBID}/${PDBID}.bdb', re.compile(r'.*/([\w]{4})\.bdb'),FILE,'PDB')) - docs.append(_create_databank('DSSP','http://swift.cmbi.ru.nl/gv/dssp/', - 'ftp://ftp.cmbi.ru.nl/pub/molbio/data/dssp/${PDBID}.dssp', + docs.append(_create_databank('DSSP','http://swift.cmbi.umcn.nl/gv/dssp/', + 'ftp://ftp.cmbi.umcn.nl/pub/molbio/data/dssp/${PDBID}.dssp', re.compile(r'.*/([\w]{4})\.dssp'),FILE,'MMCIF')) - docs.append(_create_databank('HSSP','http://swift.cmbi.ru.nl/gv/hssp/', - 'ftp://ftp.cmbi.ru.nl/pub/molbio/data/hssp/${PDBID}.hssp.bz2', + docs.append(_create_databank('HSSP','http://swift.cmbi.umcn.nl/gv/hssp/', + 'ftp://ftp.cmbi.umcn.nl/pub/molbio/data/hssp/${PDBID}.hssp.bz2', re.compile(r'.*/([\w]{4})\.hssp.bz2'),FILE,'DSSP')) - docs.append(_create_databank('PDBFINDER','http://swift.cmbi.ru.nl/gv/pdbfinder/', - 'ftp://ftp.cmbi.ru.nl/pub/molbio/data/pdbfinder/PDBFIND.TXT.gz', + docs.append(_create_databank('PDBFINDER','http://swift.cmbi.umcn.nl/gv/pdbfinder/', + 'ftp://ftp.cmbi.umcn.nl/pub/molbio/data/pdbfinder/PDBFIND.TXT.gz', re.compile(r'ID : ([\w]{4})'),LINE,'HSSP')) - docs.append(_create_databank('PDBFINDER2','http://swift.cmbi.ru.nl/gv/pdbfinder/', - 'ftp://ftp.cmbi.ru.nl/pub/molbio/data/pdbfinder2/PDBFIND2.TXT.gz', + docs.append(_create_databank('PDBFINDER2','http://swift.cmbi.umcn.nl/gv/pdbfinder/', + 'ftp://ftp.cmbi.umcn.nl/pub/molbio/data/pdbfinder2/PDBFIND2.TXT.gz', re.compile(r'ID : ([\w]{4})'),LINE,'PDBFINDER')) docs.append(_create_databank('NMR','http://www.bmrb.wisc.edu/', 'ftp://ftp.wwpdb.org/pub/pdb/data/structures/all/nmr_restraints/${PDBID}.mr.gz', @@ -62,32 +62,32 @@ def create_databanks(): docs.append(_create_databank('STRUCTUREFACTORS','http://www.pdb.org/', 'ftp://ftp.wwpdb.org/pub/pdb/data/structures/divided/structure_factors/${PART}/r${PDBID}sf.ent.gz', 
re.compile(r'.*/r([\w]{4})sf\.ent\.gz'),FILE,'MMCIF')) - docs.append(_create_databank('PDBREPORT','http://swift.cmbi.ru.nl/gv/pdbreport/', - 'http://www.cmbi.ru.nl/pdbreport/cgi-bin/nonotes?PDBID=${PDBID}', + docs.append(_create_databank('PDBREPORT','http://swift.cmbi.umcn.nl/gv/pdbreport/', + 'http://www.cmbi.umcn.nl/pdbreport/cgi-bin/nonotes?PDBID=${PDBID}', re.compile(r'pdbreport\/\w{2}\/(\w{4})\/pdbout\.txt'),FILE,'PDB')) - docs.append(_create_databank('PDB_REDO','http://www.cmbi.ru.nl/pdb_redo/', - 'http://www.cmbi.ru.nl/pdb_redo/cgi-bin/redir2.pl?pdbCode=${PDBID}', + docs.append(_create_databank('PDB_REDO','http://www.cmbi.umcn.nl/pdb_redo/', + 'http://www.cmbi.umcn.nl/pdb_redo/cgi-bin/redir2.pl?pdbCode=${PDBID}', re.compile(r'\/\w{2}\/\w{4}\/(\w{4})_final\.pdb'),FILE,'STRUCTUREFACTORS')) - docs.append(_create_databank('DSSP_REDO','http://swift.cmbi.ru.nl/gv/dssp/', - 'ftp://ftp.cmbi.ru.nl/pub/molbio/data/dssp_redo/${PDBID}.dssp', + docs.append(_create_databank('DSSP_REDO','http://swift.cmbi.umcn.nl/gv/dssp/', + 'ftp://ftp.cmbi.umcn.nl/pub/molbio/data/dssp_redo/${PDBID}.dssp', re.compile(r'.*/([\w]{4})\.dssp'),FILE,'PDB_REDO')) for lis in ['dsp','iod','sbh','sbr','ss1','ss2','tau','acc','cal','wat', 'cc1','cc2','cc3','chi']: - docs.append(_create_databank('WHATIF_PDB_%s' % lis, 'http://swift.cmbi.ru.nl/whatif/', - 'ftp://ftp.cmbi.ru.nl/pub/molbio/data/wi-lists/pdb/%s/${PDBID}/${PDBID}.%s.bz2' % (lis, lis), + docs.append(_create_databank('WHATIF_PDB_%s' % lis, 'http://swift.cmbi.umcn.nl/whatif/', + 'ftp://ftp.cmbi.umcn.nl/pub/molbio/data/wi-lists/pdb/%s/${PDBID}/${PDBID}.%s.bz2' % (lis, lis), re.compile(r'.*/([\w]{4})\.' 
+ lis + r'(\.bz2)?$'),FILE,'PDB')) - docs.append(_create_databank('WHATIF_REDO_%s' % lis, 'http://swift.cmbi.ru.nl/whatif/', - 'ftp://ftp.cmbi.ru.nl/pub/molbio/data/wi-lists/redo/%s/${PDBID}/${PDBID}.%s.bz2' % (lis, lis), + docs.append(_create_databank('WHATIF_REDO_%s' % lis, 'http://swift.cmbi.umcn.nl/whatif/', + 'ftp://ftp.cmbi.umcn.nl/pub/molbio/data/wi-lists/redo/%s/${PDBID}/${PDBID}.%s.bz2' % (lis, lis), re.compile(r'.*/([\w]{4})\.' + lis + r'(\.bz2)?$'),FILE,'PDB_REDO')) scenames = { 'ss2': 'sym-contacts', 'iod': 'ion-sites'} for lis in scenames: - docs.append(_create_databank('PDB_SCENES_%s' % lis, 'http://www.cmbi.ru.nl/pdb-vis/', - 'ftp://ftp.cmbi.ru.nl/pub/molbio/data/wi-lists/pdb/scenes/%s/${PDBID}/${PDBID}_%s.sce' % (lis, scenames[lis]), + docs.append(_create_databank('PDB_SCENES_%s' % lis, 'http://www.cmbi.umcn.nl/pdb-vis/', + 'ftp://ftp.cmbi.umcn.nl/pub/molbio/data/wi-lists/pdb/scenes/%s/${PDBID}/${PDBID}_%s.sce' % (lis, scenames[lis]), re.compile(r'.*/([\w]{4})_' + scenames[lis] + r'\.sce'),FILE,'WHATIF_PDB_%s' % lis)) - docs.append(_create_databank('REDO_SCENES_%s' % lis, 'http://www.cmbi.ru.nl/pdb-vis/', - 'ftp://ftp.cmbi.ru.nl/pub/molbio/data/wi-lists/redo/scenes/%s/${PDBID}/${PDBID}_%s.sce' % (lis, scenames[lis]), + docs.append(_create_databank('REDO_SCENES_%s' % lis, 'http://www.cmbi.umcn.nl/pdb-vis/', + 'ftp://ftp.cmbi.umcn.nl/pub/molbio/data/wi-lists/redo/scenes/%s/${PDBID}/${PDBID}_%s.sce' % (lis, scenames[lis]), re.compile(r'.*/([\w]{4})_' + scenames[lis] + r'\.sce'),FILE,'WHATIF_REDO_%s' % lis)) return docs