From 8bb828762968a0b3400da43faec7f7321fbe09d1 Mon Sep 17 00:00:00 2001
From: Coos Baakman <cbaakman@chelonium.cmbi.umcn.nl>
Date: Mon, 9 Jan 2017 13:19:15 +0100
Subject: [PATCH 1/6] fixes for annotator: use ftp library and delete tmp files

---
 annotate.py | 74 +++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 60 insertions(+), 14 deletions(-)
diff --git a/annotate.py b/annotate.py
index d655ca1..a8bc8cf 100755
--- a/annotate.py
+++ b/annotate.py
@@ -1,10 +1,12 @@
 #!/usr/bin/python
 
 import sys,os,commands
+from ftplib import FTP
 
 # Must import storage before utils
 import update_settings as settings
 from storage import storage
+
 storage.uri = settings.MONGODB_URI
 storage.db_name = settings.MONGODB_DB_NAME
 storage.connect()
@@ -27,6 +29,7 @@
 # <databank name>, <pdbid 5>
 # etc.
 
+
 # Returns a list of triples: (comment, databank name, pdbid)
 def parse_comments (lines):
 
@@ -73,6 +76,7 @@ def parse_comment(lines, entry):
 
     return ''
 
+
 def update_entry (entry):
 
     databank_name = entry ['databank_name']
@@ -84,6 +88,7 @@ def update_entry (entry):
     else:
         storage.insert ('entries', entry)
 
+
 # This function gets all comment information from a whynot
 # file and updates the corresponding entries with it.
 def annotate_from_file (path):
@@ -116,8 +121,7 @@ def annotate_from_file (path):
 # else just check all other sources of information...
 
 
-
-# Check the files in the whynot comments directory:
+print 'Check the files in the whynot comments directory'
 
 whynotdir = os.path.dirname (sys.argv [0])
 commentsdir = os.path.join (whynotdir, 'comment')
@@ -143,14 +147,16 @@ def annotate_from_file (path):
 # A pdb entry can contain only carbohydrates or only nucleic acids, in
 # which case no DSSP can be made.
 
-pdbidscarbonly = Set ()
-pdbidsnuconly = Set ()
-pdbidsnmr = Set ()
-pdbidsem = Set ()
-pdbidsother = Set ()
-pdbidsdiff = Set ()
+pdbidscarbonly = Set()
+pdbidsnuconly = Set()
+pdbidsnmr = Set()
+pdbidsem = Set()
+pdbidsother = Set()
+pdbidsdiff = Set()
+pdbidssf = Set()
+pdbidsnmrr = Set()
 
-# Parse wwpdb entry type record
+print 'Parse wwpdb entry type record'
 for line in read_http('ftp://ftp.wwpdb.org/pub/pdb/derived_data/pdb_entry_type.txt').split('\n'):
     if len(line.strip()) <= 0:
         continue
@@ -171,8 +177,26 @@ def annotate_from_file (path):
     elif method=='other':
         pdbidsother.add(pdbid)
 
-# Generate comments for missing structure factors.
-# Do this wherever the experimental method is not diffraction:
+
+print 'Listing deposited structure factor files'
+ftp = FTP('ftp.wwpdb.org')
+ftp.login()
+ftp.cwd('/pub/pdb/data/structures/divided/structure_factors/')
+for part in ftp.nlst():
+    for filename in ftp.nlst(part):
+        pdbid = filename[1: 5]
+        pdbidssf.add(pdbid)
+
+
+print 'Listing deposited nmr restraints files'
+ftp.cwd('/pub/pdb/data/structures/divided/nmr_restraints/')
+for part in ftp.nlst():
+    for filename in ftp.nlst(part):
+        pdbid = filename[0: 4]
+        pdbidsnmrr.add(pdbid)
+
+
+print 'Generate comments for missing structure factors'
 for entry in get_unannotated_entries('STRUCTUREFACTORS'):
 
     pdbid = entry['pdbid']
@@ -191,12 +215,16 @@ def annotate_from_file (path):
         entry['comment'] = 'Not a Diffraction experiment'
         entry['mtime'] = time()
 
+    elif pdbid not in pdbidssf:  # no structure factor file in the wwPDB FTP listing
+
+        entry['comment'] = 'Not deposited'
+        entry['mtime'] = time()
+
     if 'comment' in entry:
         update_entry (entry)
 
 
-# Generate comments for missing nmr data.
-# Do this wherever the experimental method is not nmr:
+print 'Generate comments for missing nmr data'
 for entry in get_unannotated_entries('NMR'):
 
     pdbid = entry['pdbid']
@@ -215,9 +243,16 @@ def annotate_from_file (path):
         entry['comment'] = 'Not an NMR experiment'
         entry['mtime'] = time()
 
+    elif pdbid not in pdbidsnmrr:  # no nmr restraints file in the wwPDB FTP listing
+
+        entry['comment'] = 'Not deposited'
+        entry['mtime'] = time()
+
     if 'comment' in entry:
         update_entry (entry)
 
+
+print 'Generate comments for missing hssp files'
 # To find out why HSSP entries are missing, one must check the error output of
 # mkhssp when it ran. It's been stored in a reserved directory:
 for entry in get_unannotated_entries('HSSP'):
@@ -243,6 +278,8 @@ def annotate_from_file (path):
         entry ['mtime'] = time()
         update_entry (entry)
 
+
+print 'Generate comments for missing dssp files'
 # DSSP files can be missing for multiple reasons:
 # 1 the structure has no protein, carbohydrates/nucleic acids only
 # 2 the structure hase no backbone, only alpha carbon atoms
@@ -277,7 +314,10 @@ def annotate_from_file (path):
                     continue
 
             # Run dsspcmbi and catch stderr:
-            lines = commands.getoutput('%s %s /tmp/%s.dssp 2>&1 >/dev/null' % (mkdssp, inputfile, pdbid)).split('\n')
+            dsspfile = '/tmp/%s.dssp' % pdbid
+            lines = commands.getoutput('%s %s %s 2>&1 >/dev/null' % (mkdssp, inputfile, dsspfile)).split('\n')
+            if os.path.isfile(dsspfile):  # only stderr is inspected; drop the temporary output
+                os.remove(dsspfile)
             if lines [-1].strip () == 'empty protein, or no valid complete residues':
                 entry['comment'] = 'No residues with complete backbone' # for backwards compatibility
                 entry['mtime'] = time()
@@ -285,6 +325,8 @@ def annotate_from_file (path):
         if 'comment' in entry:
             update_entry (entry)
 
+
+print 'Generate comments for missing bdb files'
 # BDB comments are simply stored in a file, generated by the bdb script.
 for entry in get_missing_entries('BDB'):
 
@@ -301,6 +343,8 @@ def annotate_from_file (path):
         entry ['mtime'] = time()
         update_entry (entry)
 
+
+print 'Generate comments for whatif lists'
 # WHATIF list comments are simply stored in a file, generated by the script.
 for lis in ['acc', 'cal', 'cc1', 'cc2', 'cc3', 'chi', 'dsp', 'iod', 'sbh', 'sbr', 'ss1', 'ss2', 'tau', 'wat']:
     for src in ['pdb', 'redo']:
@@ -320,6 +364,8 @@ def annotate_from_file (path):
                 entry['mtime'] = time()
                 update_entry (entry)
 
+
+print 'Generate comments for scenes'
 # WHATIF scene comments are simply stored in a file, generated by the script.
 for lis in ['iod', 'ss2']:
     for src in ['pdb', 'redo']:

From 4b82815a00823a362d8c8a74bfb9ec12011ea94c Mon Sep 17 00:00:00 2001
From: Coos Baakman <cbaakman@chelonium.cmbi.umcn.nl>
Date: Fri, 30 Jun 2017 12:33:55 +0200
Subject: [PATCH 2/6] add the new whynot file location

---
 annotate.py | 22 +++++++++++++++++++---
 1 file changed, 19 insertions(+), 3 deletions(-)

diff --git a/annotate.py b/annotate.py
index a8bc8cf..c6b3cc5 100755
--- a/annotate.py
+++ b/annotate.py
@@ -326,6 +326,22 @@ def annotate_from_file (path):
             update_entry (entry)
 
 
+print 'Generate comments for missing pdbredo entries'
+for entry in get_missing_entries('PDB_REDO'):
+
+    pdbid = entry['pdbid']
+    whynotfile = '/srv/data/pdb_redo/whynot/%s.txt' % pdbid
+    if not os.path.isfile(whynotfile):
+        continue
+
+    lines = open(whynotfile, 'r').readlines()
+    comment = parse_comment(lines, entry)
+    if len(comment) > 0:
+        entry['comment'] = comment
+        entry['mtime'] = time()
+        update_entry(entry)
+
+
 print 'Generate comments for missing bdb files'
 # BDB comments are simply stored in a file, generated by the bdb script.
 for entry in get_missing_entries('BDB'):
@@ -339,9 +355,9 @@ def annotate_from_file (path):
     lines = open(whynotfile, 'r').readlines()
     comment = parse_comment(lines, entry)
     if len(comment) > 0:
-        entry ['comment'] = comment
-        entry ['mtime'] = time()
-        update_entry (entry)
+        entry['comment'] = comment
+        entry['mtime'] = time()
+        update_entry(entry)
 
 
 print 'Generate comments for whatif lists'

From aa0306b57954ad0eda9a88c3b0040d26ca7ebe63 Mon Sep 17 00:00:00 2001
From: Coos Baakman <cbaakman@chelonium.cmbi.umcn.nl>
Date: Tue, 4 Jul 2017 12:17:27 +0200
Subject: [PATCH 3/6] fix to make comment parser accept PDB-REDO as PDB_REDO

---
 annotate.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/annotate.py b/annotate.py
index c6b3cc5..40c2732 100755
--- a/annotate.py
+++ b/annotate.py
@@ -10,7 +10,7 @@
 storage.uri = settings.MONGODB_URI
 storage.db_name = settings.MONGODB_DB_NAME
 storage.connect()
-storage.authenticate ('whynotadmin', 'waivuy8N')
+#storage.authenticate ('whynotadmin', 'waivuy8N')
 
 from utils import entries_by_pdbid, get_unannotated_entries, get_missing_entries, read_http
 
@@ -31,7 +31,7 @@
 
 
 # Returns a list of triples: (comment, databank name, pdbid)
-def parse_comments (lines):
+def parse_comments(lines):
 
     if len(lines) < 2:
         return {}
@@ -46,6 +46,7 @@ def parse_comments (lines):
         elif ',' in line:
 
             databank_name, pdbid = line.strip ().replace (' ','').split (',')
+            databank_name = databank_name.replace('-', '_')  # str.replace returns a new string; keep it so PDB-REDO becomes PDB_REDO
             d.append ((comment, databank_name, pdbid))
 
         elif len (line.strip ()) > 0:
@@ -70,7 +71,7 @@ def parse_comment(lines, entry):
 
     for line in lines[1:]:
 
-        line = line.replace (' ','').strip ()
+        line = line.replace (' ','').replace('-', '_').strip ()
         if line == '%s,%s' % (entry ['databank_name'], entry ['pdbid']):
             return comment
 

From aeacaa838183bb5ac5032f5c555c97d515d65b16 Mon Sep 17 00:00:00 2001
From: Coos Baakman <cbaakman@cmbi23.cmbi.umcn.nl>
Date: Thu, 6 Jul 2017 11:28:49 +0200
Subject: [PATCH 4/6] take mongo data from chelonium

---
 whynot_web/default_settings.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/whynot_web/default_settings.py b/whynot_web/default_settings.py
index 9d5bf3d..b2647b7 100644
--- a/whynot_web/default_settings.py
+++ b/whynot_web/default_settings.py
@@ -1,3 +1,3 @@
 # mongo
-MONGODB_URI = "mongodb://whynot_mongo_1"
+MONGODB_URI = "mongodb://chelonium.cmbi.umcn.nl:27017"
 MONGODB_DB_NAME = "whynot"

From 5e8e9e8c362b2aebf6b40d76c752b8ffc97b4876 Mon Sep 17 00:00:00 2001
From: Coos Baakman <c.baakman@radboudumc.nl.com>
Date: Thu, 16 Nov 2017 14:16:11 +0100
Subject: [PATCH 5/6] make pdbfinder depend on hssp

---
 install.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/install.py b/install.py
index 3983388..e861d23 100755
--- a/install.py
+++ b/install.py
@@ -52,7 +52,7 @@ def create_databanks():
         re.compile(r'.*/([\w]{4})\.hssp.bz2'),FILE,'DSSP'))
     docs.append(_create_databank('PDBFINDER','http://swift.cmbi.ru.nl/gv/pdbfinder/',
         'ftp://ftp.cmbi.ru.nl/pub/molbio/data/pdbfinder/PDBFIND.TXT.gz',
-        re.compile(r'ID           : ([\w]{4})'),LINE,'PDB'))
+        re.compile(r'ID           : ([\w]{4})'),LINE,'HSSP'))
     docs.append(_create_databank('PDBFINDER2','http://swift.cmbi.ru.nl/gv/pdbfinder/',
         'ftp://ftp.cmbi.ru.nl/pub/molbio/data/pdbfinder2/PDBFIND2.TXT.gz',
         re.compile(r'ID           : ([\w]{4})'),LINE,'PDBFINDER'))

From cda88a38a5abaaebf851f4558e4395bc6abf0322 Mon Sep 17 00:00:00 2001
From: Coos Baakman <c.baakman@radboudumc.nl.com>
Date: Thu, 19 Jul 2018 11:51:30 +0200
Subject: [PATCH 6/6] replaced cmbi.ru.nl by cmbi.umcn.nl

---
 install.py | 48 ++++++++++++++++++++++++------------------------
 1 file changed, 24 insertions(+), 24 deletions(-)

diff --git a/install.py b/install.py
index e861d23..00b5412 100755
--- a/install.py
+++ b/install.py
@@ -41,20 +41,20 @@ def create_databanks():
     docs.append(_create_databank('PDB','http://www.wwpdb.org/',
         'ftp://ftp.wwpdb.org/pub/pdb/data/structures/divided/pdb/${PART}/pdb${PDBID}.ent.gz',
         re.compile(r'.*/pdb([\w]{4})\.ent(\.gz)?'),FILE,'MMCIF'))
-    docs.append(_create_databank('BDB','http://www.cmbi.ru.nl/bdb/',
-        'ftp://ftp.cmbi.ru.nl/pub/molbio/data/bdb/${PART}/${PDBID}/${PDBID}.bdb',
+    docs.append(_create_databank('BDB','http://www.cmbi.umcn.nl/bdb/',
+        'ftp://ftp.cmbi.umcn.nl/pub/molbio/data/bdb/${PART}/${PDBID}/${PDBID}.bdb',
         re.compile(r'.*/([\w]{4})\.bdb'),FILE,'PDB'))
-    docs.append(_create_databank('DSSP','http://swift.cmbi.ru.nl/gv/dssp/',
-        'ftp://ftp.cmbi.ru.nl/pub/molbio/data/dssp/${PDBID}.dssp',
+    docs.append(_create_databank('DSSP','http://swift.cmbi.umcn.nl/gv/dssp/',
+        'ftp://ftp.cmbi.umcn.nl/pub/molbio/data/dssp/${PDBID}.dssp',
         re.compile(r'.*/([\w]{4})\.dssp'),FILE,'MMCIF'))
-    docs.append(_create_databank('HSSP','http://swift.cmbi.ru.nl/gv/hssp/',
-        'ftp://ftp.cmbi.ru.nl/pub/molbio/data/hssp/${PDBID}.hssp.bz2',
+    docs.append(_create_databank('HSSP','http://swift.cmbi.umcn.nl/gv/hssp/',
+        'ftp://ftp.cmbi.umcn.nl/pub/molbio/data/hssp/${PDBID}.hssp.bz2',
         re.compile(r'.*/([\w]{4})\.hssp.bz2'),FILE,'DSSP'))
-    docs.append(_create_databank('PDBFINDER','http://swift.cmbi.ru.nl/gv/pdbfinder/',
-        'ftp://ftp.cmbi.ru.nl/pub/molbio/data/pdbfinder/PDBFIND.TXT.gz',
+    docs.append(_create_databank('PDBFINDER','http://swift.cmbi.umcn.nl/gv/pdbfinder/',
+        'ftp://ftp.cmbi.umcn.nl/pub/molbio/data/pdbfinder/PDBFIND.TXT.gz',
         re.compile(r'ID           : ([\w]{4})'),LINE,'HSSP'))
-    docs.append(_create_databank('PDBFINDER2','http://swift.cmbi.ru.nl/gv/pdbfinder/',
-        'ftp://ftp.cmbi.ru.nl/pub/molbio/data/pdbfinder2/PDBFIND2.TXT.gz',
+    docs.append(_create_databank('PDBFINDER2','http://swift.cmbi.umcn.nl/gv/pdbfinder/',
+        'ftp://ftp.cmbi.umcn.nl/pub/molbio/data/pdbfinder2/PDBFIND2.TXT.gz',
         re.compile(r'ID           : ([\w]{4})'),LINE,'PDBFINDER'))
     docs.append(_create_databank('NMR','http://www.bmrb.wisc.edu/',
         'ftp://ftp.wwpdb.org/pub/pdb/data/structures/all/nmr_restraints/${PDBID}.mr.gz',
@@ -62,32 +62,32 @@ def create_databanks():
     docs.append(_create_databank('STRUCTUREFACTORS','http://www.pdb.org/',
         'ftp://ftp.wwpdb.org/pub/pdb/data/structures/divided/structure_factors/${PART}/r${PDBID}sf.ent.gz',
         re.compile(r'.*/r([\w]{4})sf\.ent\.gz'),FILE,'MMCIF'))
-    docs.append(_create_databank('PDBREPORT','http://swift.cmbi.ru.nl/gv/pdbreport/',
-        'http://www.cmbi.ru.nl/pdbreport/cgi-bin/nonotes?PDBID=${PDBID}',
+    docs.append(_create_databank('PDBREPORT','http://swift.cmbi.umcn.nl/gv/pdbreport/',
+        'http://www.cmbi.umcn.nl/pdbreport/cgi-bin/nonotes?PDBID=${PDBID}',
         re.compile(r'pdbreport\/\w{2}\/(\w{4})\/pdbout\.txt'),FILE,'PDB'))
-    docs.append(_create_databank('PDB_REDO','http://www.cmbi.ru.nl/pdb_redo/',
-        'http://www.cmbi.ru.nl/pdb_redo/cgi-bin/redir2.pl?pdbCode=${PDBID}',
+    docs.append(_create_databank('PDB_REDO','http://www.cmbi.umcn.nl/pdb_redo/',
+        'http://www.cmbi.umcn.nl/pdb_redo/cgi-bin/redir2.pl?pdbCode=${PDBID}',
         re.compile(r'\/\w{2}\/\w{4}\/(\w{4})_final\.pdb'),FILE,'STRUCTUREFACTORS'))
-    docs.append(_create_databank('DSSP_REDO','http://swift.cmbi.ru.nl/gv/dssp/',
-        'ftp://ftp.cmbi.ru.nl/pub/molbio/data/dssp_redo/${PDBID}.dssp',
+    docs.append(_create_databank('DSSP_REDO','http://swift.cmbi.umcn.nl/gv/dssp/',
+        'ftp://ftp.cmbi.umcn.nl/pub/molbio/data/dssp_redo/${PDBID}.dssp',
         re.compile(r'.*/([\w]{4})\.dssp'),FILE,'PDB_REDO'))
 
     for lis in ['dsp','iod','sbh','sbr','ss1','ss2','tau','acc','cal','wat',
                 'cc1','cc2','cc3','chi']:
-        docs.append(_create_databank('WHATIF_PDB_%s' % lis, 'http://swift.cmbi.ru.nl/whatif/',
-            'ftp://ftp.cmbi.ru.nl/pub/molbio/data/wi-lists/pdb/%s/${PDBID}/${PDBID}.%s.bz2' % (lis, lis),
+        docs.append(_create_databank('WHATIF_PDB_%s' % lis, 'http://swift.cmbi.umcn.nl/whatif/',
+            'ftp://ftp.cmbi.umcn.nl/pub/molbio/data/wi-lists/pdb/%s/${PDBID}/${PDBID}.%s.bz2' % (lis, lis),
             re.compile(r'.*/([\w]{4})\.' + lis + r'(\.bz2)?$'),FILE,'PDB'))
-        docs.append(_create_databank('WHATIF_REDO_%s' % lis, 'http://swift.cmbi.ru.nl/whatif/',
-            'ftp://ftp.cmbi.ru.nl/pub/molbio/data/wi-lists/redo/%s/${PDBID}/${PDBID}.%s.bz2' % (lis, lis),
+        docs.append(_create_databank('WHATIF_REDO_%s' % lis, 'http://swift.cmbi.umcn.nl/whatif/',
+            'ftp://ftp.cmbi.umcn.nl/pub/molbio/data/wi-lists/redo/%s/${PDBID}/${PDBID}.%s.bz2' % (lis, lis),
             re.compile(r'.*/([\w]{4})\.' + lis + r'(\.bz2)?$'),FILE,'PDB_REDO'))
 
     scenames = { 'ss2': 'sym-contacts', 'iod': 'ion-sites'}
     for lis in scenames:
-        docs.append(_create_databank('PDB_SCENES_%s' % lis, 'http://www.cmbi.ru.nl/pdb-vis/',
-            'ftp://ftp.cmbi.ru.nl/pub/molbio/data/wi-lists/pdb/scenes/%s/${PDBID}/${PDBID}_%s.sce' % (lis, scenames[lis]),
+        docs.append(_create_databank('PDB_SCENES_%s' % lis, 'http://www.cmbi.umcn.nl/pdb-vis/',
+            'ftp://ftp.cmbi.umcn.nl/pub/molbio/data/wi-lists/pdb/scenes/%s/${PDBID}/${PDBID}_%s.sce' % (lis, scenames[lis]),
             re.compile(r'.*/([\w]{4})_' + scenames[lis] + r'\.sce'),FILE,'WHATIF_PDB_%s' % lis))
-        docs.append(_create_databank('REDO_SCENES_%s' % lis, 'http://www.cmbi.ru.nl/pdb-vis/',
-            'ftp://ftp.cmbi.ru.nl/pub/molbio/data/wi-lists/redo/scenes/%s/${PDBID}/${PDBID}_%s.sce' % (lis, scenames[lis]),
+        docs.append(_create_databank('REDO_SCENES_%s' % lis, 'http://www.cmbi.umcn.nl/pdb-vis/',
+            'ftp://ftp.cmbi.umcn.nl/pub/molbio/data/wi-lists/redo/scenes/%s/${PDBID}/${PDBID}_%s.sce' % (lis, scenames[lis]),
             re.compile(r'.*/([\w]{4})_' + scenames[lis] + r'\.sce'),FILE,'WHATIF_REDO_%s' % lis))
 
     return docs