diff --git a/annotate.py b/annotate.py index d655ca1..40c2732 100755 --- a/annotate.py +++ b/annotate.py @@ -1,14 +1,16 @@ #!/usr/bin/python import sys,os,commands +from ftplib import FTP # Must import storage before utils import update_settings as settings from storage import storage + storage.uri = settings.MONGODB_URI storage.db_name = settings.MONGODB_DB_NAME storage.connect() -storage.authenticate ('whynotadmin', 'waivuy8N') +#storage.authenticate ('whynotadmin', 'waivuy8N') from utils import entries_by_pdbid, get_unannotated_entries, get_missing_entries, read_http @@ -27,8 +29,9 @@ # , # etc. + # Returns a list of triples: (comment, databank name, pdbid) -def parse_comments (lines): +def parse_comments(lines): if len(lines) < 2: return {} @@ -43,6 +46,7 @@ def parse_comments (lines): elif ',' in line: databank_name, pdbid = line.strip ().replace (' ','').split (',') + databank_name.replace('-', '_') d.append ((comment, databank_name, pdbid)) elif len (line.strip ()) > 0: @@ -67,12 +71,13 @@ def parse_comment(lines, entry): for line in lines[1:]: - line = line.replace (' ','').strip () + line = line.replace (' ','').replace('-', '_').strip () if line == '%s,%s' % (entry ['databank_name'], entry ['pdbid']): return comment return '' + def update_entry (entry): databank_name = entry ['databank_name'] @@ -84,6 +89,7 @@ def update_entry (entry): else: storage.insert ('entries', entry) + # This function gets all comment information from a whynot # file and updates the corresponding entries with it. def annotate_from_file (path): @@ -116,8 +122,7 @@ def annotate_from_file (path): # else just check all other sources of information... - -# Check the files in the whynot comments directory: +print 'Check the files in the whynot comments directory' whynotdir = os.path.dirname (sys.argv [0]) commentsdir = os.path.join (whynotdir, 'comment') @@ -143,14 +148,16 @@ def annotate_from_file (path): # A pdb entry can contain only carbohydrates or only nucleic acids, in # which case no DSSP can be made. -pdbidscarbonly = Set () -pdbidsnuconly = Set () -pdbidsnmr = Set () -pdbidsem = Set () -pdbidsother = Set () -pdbidsdiff = Set () +pdbidscarbonly = Set() +pdbidsnuconly = Set() +pdbidsnmr = Set() +pdbidsem = Set() +pdbidsother = Set() +pdbidsdiff = Set() +pdbidssf = Set() +pdbidsnmrr = Set() -# Parse wwpdb entry type record +print 'Parse wwpdb entry type record' for line in read_http('ftp://ftp.wwpdb.org/pub/pdb/derived_data/pdb_entry_type.txt').split('\n'): if len(line.strip()) <= 0: continue @@ -171,8 +178,26 @@ def annotate_from_file (path): elif method=='other': pdbidsother.add(pdbid) -# Generate comments for missing structure factors. -# Do this wherever the experimental method is not diffraction: + +print 'Listing deposited structure factor files' +ftp = FTP('ftp.wwpdb.org') +ftp.login() +ftp.cwd('/pub/pdb/data/structures/divided/structure_factors/') +for part in ftp.nlst(): + for filename in ftp.nlst(part): + pdbid = filename[1: 5] + pdbidssf.add(pdbid) + + +print 'Listing deposited nmr restraints files' +ftp.cwd('/pub/pdb/data/structures/divided/nmr_restraints/') +for part in ftp.nlst(): + for filename in ftp.nlst(part): + pdbid = filename[0: 4] + pdbidsnmrr.add(pdbid) + + +print 'Generate comments for missing structure factors' for entry in get_unannotated_entries('STRUCTUREFACTORS'): pdbid = entry['pdbid'] @@ -191,12 +216,16 @@ def annotate_from_file (path): entry['comment'] = 'Not a Diffraction experiment' entry['mtime'] = time() + elif pdbid not in pdbidssf: + + entry['comment'] = 'Not deposited' + entry['mtime'] = time() + if 'comment' in entry: update_entry (entry) -# Generate comments for missing nmr data. -# Do this wherever the experimental method is not nmr: +print 'Generate comments for missing nmr data' for entry in get_unannotated_entries('NMR'): pdbid = entry['pdbid'] @@ -215,9 +244,16 @@ def annotate_from_file (path): entry['comment'] = 'Not an NMR experiment' entry['mtime'] = time() + elif pdbid not in pdbidsnmrr: + + entry['comment'] = 'Not deposited' + entry['mtime'] = time() + if 'comment' in entry: update_entry (entry) + +print 'Generate comments for missing hssp files' # To find out why HSSP entries are missing, one must check the error output of # mkhssp when it ran. It's been stored in a reserved directory: for entry in get_unannotated_entries('HSSP'): @@ -243,6 +279,8 @@ def annotate_from_file (path): entry ['mtime'] = time() update_entry (entry) + +print 'Generate comments for missing dssp files' # DSSP files can be missing for multiple reasons: # 1 the structure has no protein, carbohydrates/nucleic acids only # 2 the structure hase no backbone, only alpha carbon atoms @@ -277,7 +315,10 @@ def annotate_from_file (path): continue # Run dsspcmbi and catch stderr: - lines = commands.getoutput('%s %s /tmp/%s.dssp 2>&1 >/dev/null' % (mkdssp, inputfile, pdbid)).split('\n') + dsspfile = '/tmp/%s.dssp' % pdbid + lines = commands.getoutput('%s %s %s 2>&1 >/dev/null' % (mkdssp, inputfile, dsspfile)).split('\n') + if os.path.isfile(dsspfile): + os.remove(dsspfile) if lines [-1].strip () == 'empty protein, or no valid complete residues': entry['comment'] = 'No residues with complete backbone' # for backwards compatibility entry['mtime'] = time() @@ -285,6 +326,24 @@ def annotate_from_file (path): if 'comment' in entry: update_entry (entry) + +print 'Generate comments for missing pdbredo entries' +for entry in get_missing_entries('PDB_REDO'): + + pdbid = entry['pdbid'] + whynotfile = '/srv/data/pdb_redo/whynot/%s.txt' % pdbid + if not os.path.isfile(whynotfile): + continue + + lines = open(whynotfile, 'r').readlines() + comment = parse_comment(lines, entry) + if len(comment) > 0: + entry['comment'] = comment + entry['mtime'] = time() + update_entry(entry) + + +print 'Generate comments for missing bdb files' # BDB comments are simply stored in a file, generated by the bdb script. for entry in get_missing_entries('BDB'): @@ -297,10 +356,12 @@ def annotate_from_file (path): lines = open(whynotfile, 'r').readlines() comment = parse_comment(lines, entry) if len(comment) > 0: - entry ['comment'] = comment - entry ['mtime'] = time() - update_entry (entry) + entry['comment'] = comment + entry['mtime'] = time() + update_entry(entry) + +print 'Generate comments for whatif lists' # WHATIF list comments are simply stored in a file, generated by the script. for lis in ['acc', 'cal', 'cc1', 'cc2', 'cc3', 'chi', 'dsp', 'iod', 'sbh', 'sbr', 'ss1', 'ss2', 'tau', 'wat']: for src in ['pdb', 'redo']: @@ -320,6 +381,8 @@ def annotate_from_file (path): entry['mtime'] = time() update_entry (entry) + +print 'Generate comments for scenes' # WHATIF scene comments are simply stored in a file, generated by the script. for lis in ['iod', 'ss2']: for src in ['pdb', 'redo']: diff --git a/install.py b/install.py index 3983388..00b5412 100755 --- a/install.py +++ b/install.py @@ -41,20 +41,20 @@ def create_databanks(): docs.append(_create_databank('PDB','http://www.wwpdb.org/', 'ftp://ftp.wwpdb.org/pub/pdb/data/structures/divided/pdb/${PART}/pdb${PDBID}.ent.gz', re.compile(r'.*/pdb([\w]{4})\.ent(\.gz)?'),FILE,'MMCIF')) - docs.append(_create_databank('BDB','http://www.cmbi.ru.nl/bdb/', - 'ftp://ftp.cmbi.ru.nl/pub/molbio/data/bdb/${PART}/${PDBID}/${PDBID}.bdb', + docs.append(_create_databank('BDB','http://www.cmbi.umcn.nl/bdb/', + 'ftp://ftp.cmbi.umcn.nl/pub/molbio/data/bdb/${PART}/${PDBID}/${PDBID}.bdb', re.compile(r'.*/([\w]{4})\.bdb'),FILE,'PDB')) - docs.append(_create_databank('DSSP','http://swift.cmbi.ru.nl/gv/dssp/', - 'ftp://ftp.cmbi.ru.nl/pub/molbio/data/dssp/${PDBID}.dssp', + docs.append(_create_databank('DSSP','http://swift.cmbi.umcn.nl/gv/dssp/', + 'ftp://ftp.cmbi.umcn.nl/pub/molbio/data/dssp/${PDBID}.dssp', re.compile(r'.*/([\w]{4})\.dssp'),FILE,'MMCIF')) - docs.append(_create_databank('HSSP','http://swift.cmbi.ru.nl/gv/hssp/', - 'ftp://ftp.cmbi.ru.nl/pub/molbio/data/hssp/${PDBID}.hssp.bz2', + docs.append(_create_databank('HSSP','http://swift.cmbi.umcn.nl/gv/hssp/', + 'ftp://ftp.cmbi.umcn.nl/pub/molbio/data/hssp/${PDBID}.hssp.bz2', re.compile(r'.*/([\w]{4})\.hssp.bz2'),FILE,'DSSP')) - docs.append(_create_databank('PDBFINDER','http://swift.cmbi.ru.nl/gv/pdbfinder/', - 'ftp://ftp.cmbi.ru.nl/pub/molbio/data/pdbfinder/PDBFIND.TXT.gz', - re.compile(r'ID : ([\w]{4})'),LINE,'PDB')) - docs.append(_create_databank('PDBFINDER2','http://swift.cmbi.ru.nl/gv/pdbfinder/', - 'ftp://ftp.cmbi.ru.nl/pub/molbio/data/pdbfinder2/PDBFIND2.TXT.gz', + docs.append(_create_databank('PDBFINDER','http://swift.cmbi.umcn.nl/gv/pdbfinder/', + 'ftp://ftp.cmbi.umcn.nl/pub/molbio/data/pdbfinder/PDBFIND.TXT.gz', + re.compile(r'ID : ([\w]{4})'),LINE,'HSSP')) + docs.append(_create_databank('PDBFINDER2','http://swift.cmbi.umcn.nl/gv/pdbfinder/', + 'ftp://ftp.cmbi.umcn.nl/pub/molbio/data/pdbfinder2/PDBFIND2.TXT.gz', re.compile(r'ID : ([\w]{4})'),LINE,'PDBFINDER')) docs.append(_create_databank('NMR','http://www.bmrb.wisc.edu/', 'ftp://ftp.wwpdb.org/pub/pdb/data/structures/all/nmr_restraints/${PDBID}.mr.gz', @@ -62,32 +62,32 @@ def create_databanks(): docs.append(_create_databank('STRUCTUREFACTORS','http://www.pdb.org/', 'ftp://ftp.wwpdb.org/pub/pdb/data/structures/divided/structure_factors/${PART}/r${PDBID}sf.ent.gz', re.compile(r'.*/r([\w]{4})sf\.ent\.gz'),FILE,'MMCIF')) - docs.append(_create_databank('PDBREPORT','http://swift.cmbi.ru.nl/gv/pdbreport/', - 'http://www.cmbi.ru.nl/pdbreport/cgi-bin/nonotes?PDBID=${PDBID}', + docs.append(_create_databank('PDBREPORT','http://swift.cmbi.umcn.nl/gv/pdbreport/', + 'http://www.cmbi.umcn.nl/pdbreport/cgi-bin/nonotes?PDBID=${PDBID}', re.compile(r'pdbreport\/\w{2}\/(\w{4})\/pdbout\.txt'),FILE,'PDB')) - docs.append(_create_databank('PDB_REDO','http://www.cmbi.ru.nl/pdb_redo/', - 'http://www.cmbi.ru.nl/pdb_redo/cgi-bin/redir2.pl?pdbCode=${PDBID}', + docs.append(_create_databank('PDB_REDO','http://www.cmbi.umcn.nl/pdb_redo/', + 'http://www.cmbi.umcn.nl/pdb_redo/cgi-bin/redir2.pl?pdbCode=${PDBID}', re.compile(r'\/\w{2}\/\w{4}\/(\w{4})_final\.pdb'),FILE,'STRUCTUREFACTORS')) - docs.append(_create_databank('DSSP_REDO','http://swift.cmbi.ru.nl/gv/dssp/', - 'ftp://ftp.cmbi.ru.nl/pub/molbio/data/dssp_redo/${PDBID}.dssp', + docs.append(_create_databank('DSSP_REDO','http://swift.cmbi.umcn.nl/gv/dssp/', + 'ftp://ftp.cmbi.umcn.nl/pub/molbio/data/dssp_redo/${PDBID}.dssp', re.compile(r'.*/([\w]{4})\.dssp'),FILE,'PDB_REDO')) for lis in ['dsp','iod','sbh','sbr','ss1','ss2','tau','acc','cal','wat', 'cc1','cc2','cc3','chi']: - docs.append(_create_databank('WHATIF_PDB_%s' % lis, 'http://swift.cmbi.ru.nl/whatif/', - 'ftp://ftp.cmbi.ru.nl/pub/molbio/data/wi-lists/pdb/%s/${PDBID}/${PDBID}.%s.bz2' % (lis, lis), + docs.append(_create_databank('WHATIF_PDB_%s' % lis, 'http://swift.cmbi.umcn.nl/whatif/', + 'ftp://ftp.cmbi.umcn.nl/pub/molbio/data/wi-lists/pdb/%s/${PDBID}/${PDBID}.%s.bz2' % (lis, lis), re.compile(r'.*/([\w]{4})\.' + lis + r'(\.bz2)?$'),FILE,'PDB')) - docs.append(_create_databank('WHATIF_REDO_%s' % lis, 'http://swift.cmbi.ru.nl/whatif/', - 'ftp://ftp.cmbi.ru.nl/pub/molbio/data/wi-lists/redo/%s/${PDBID}/${PDBID}.%s.bz2' % (lis, lis), + docs.append(_create_databank('WHATIF_REDO_%s' % lis, 'http://swift.cmbi.umcn.nl/whatif/', + 'ftp://ftp.cmbi.umcn.nl/pub/molbio/data/wi-lists/redo/%s/${PDBID}/${PDBID}.%s.bz2' % (lis, lis), re.compile(r'.*/([\w]{4})\.' + lis + r'(\.bz2)?$'),FILE,'PDB_REDO')) scenames = { 'ss2': 'sym-contacts', 'iod': 'ion-sites'} for lis in scenames: - docs.append(_create_databank('PDB_SCENES_%s' % lis, 'http://www.cmbi.ru.nl/pdb-vis/', - 'ftp://ftp.cmbi.ru.nl/pub/molbio/data/wi-lists/pdb/scenes/%s/${PDBID}/${PDBID}_%s.sce' % (lis, scenames[lis]), + docs.append(_create_databank('PDB_SCENES_%s' % lis, 'http://www.cmbi.umcn.nl/pdb-vis/', + 'ftp://ftp.cmbi.umcn.nl/pub/molbio/data/wi-lists/pdb/scenes/%s/${PDBID}/${PDBID}_%s.sce' % (lis, scenames[lis]), re.compile(r'.*/([\w]{4})_' + scenames[lis] + r'\.sce'),FILE,'WHATIF_PDB_%s' % lis)) - docs.append(_create_databank('REDO_SCENES_%s' % lis, 'http://www.cmbi.ru.nl/pdb-vis/', - 'ftp://ftp.cmbi.ru.nl/pub/molbio/data/wi-lists/redo/scenes/%s/${PDBID}/${PDBID}_%s.sce' % (lis, scenames[lis]), + docs.append(_create_databank('REDO_SCENES_%s' % lis, 'http://www.cmbi.umcn.nl/pdb-vis/', + 'ftp://ftp.cmbi.umcn.nl/pub/molbio/data/wi-lists/redo/scenes/%s/${PDBID}/${PDBID}_%s.sce' % (lis, scenames[lis]), re.compile(r'.*/([\w]{4})_' + scenames[lis] + r'\.sce'),FILE,'WHATIF_REDO_%s' % lis)) return docs diff --git a/whynot_web/default_settings.py b/whynot_web/default_settings.py index 9d5bf3d..b2647b7 100644 --- a/whynot_web/default_settings.py +++ b/whynot_web/default_settings.py @@ -1,3 +1,3 @@ # mongo -MONGODB_URI = "mongodb://whynot_mongo_1" +MONGODB_URI = "mongodb://chelonium.cmbi.umcn.nl:27017" MONGODB_DB_NAME = "whynot"