glample · Khalife · Dec 28, 2016 · Mar 16, 2017 · glample · Jan 1, 2018
diff --git a/loader.py b/loader.py
@@ -61,9 +61,7 @@ def word_mapping(sentences, lower):
     dico = create_dico(words)
     dico['<UNK>'] = 10000000
     word_to_id, id_to_word = create_mapping(dico)
-    print "Found %i unique words (%i in total)" % (
-        len(dico), sum(len(x) for x in words)
-    )
+    print("Found %i unique words (%i in total)" % ( len(dico), sum(len(x) for x in words) ))
     return dico, word_to_id, id_to_word
 
 
@@ -74,7 +72,7 @@ def char_mapping(sentences):
     chars = ["".join([w[0] for w in s]) for s in sentences]
     dico = create_dico(chars)
     char_to_id, id_to_char = create_mapping(dico)
-    print "Found %i unique characters" % len(dico)
+    print("Found %i unique characters" % len(dico) )
     return dico, char_to_id, id_to_char
 
 
@@ -85,7 +83,7 @@ def tag_mapping(sentences):
     tags = [[word[-1] for word in s] for s in sentences]
     dico = create_dico(tags)
     tag_to_id, id_to_tag = create_mapping(dico)
-    print "Found %i unique named entity tags" % len(dico)
+    print("Found %i unique named entity tags" % len(dico) )
     return dico, tag_to_id, id_to_tag
 
 
@@ -160,7 +158,7 @@ def augment_with_pretrained(dictionary, ext_emb_path, words):
     to the dictionary, otherwise, we only add the words that are given by
     `words` (typically the words in the development and test sets.)
     """
-    print 'Loading pretrained embeddings from %s...' % ext_emb_path
+    print('Loading pretrained embeddings from %s...' % ext_emb_path)
     assert os.path.isfile(ext_emb_path)
 
     # Load pretrained embeddings from file
@@ -188,3 +186,6 @@ def augment_with_pretrained(dictionary, ext_emb_path, words):
 
     word_to_id, id_to_word = create_mapping(dictionary)
     return dictionary, word_to_id, id_to_word
+
+if __name__ == "__main__":
+    print("testMain")
diff --git a/model.py b/model.py
@@ -5,7 +5,7 @@
 import theano
 import theano.tensor as T
 import codecs
-import cPickle
+import pickle as cPickle
 
 from utils import shared, set_values, get_name
 from nn import HiddenLayer, EmbeddingLayer, DropoutLayer, LSTM, forward
@@ -163,7 +163,7 @@ def build(self,
             # Initialize with pretrained embeddings
             if pre_emb and training:
                 new_weights = word_layer.embeddings.get_value()
-                print 'Loading pretrained embeddings from %s...' % pre_emb
+                print('Loading pretrained embeddings from %s...' % pre_emb)
                 pretrained = {}
                 emb_invalid = 0
                 for i, line in enumerate(codecs.open(pre_emb, 'r', 'utf-8')):
@@ -175,7 +175,7 @@ def build(self,
                     else:
                         emb_invalid += 1
                 if emb_invalid > 0:
-                    print 'WARNING: %i invalid lines' % emb_invalid
+                    print('WARNING: %i invalid lines' % emb_invalid)
                 c_found = 0
                 c_lower = 0
                 c_zeros = 0
@@ -194,7 +194,7 @@ def build(self,
                         ]
                         c_zeros += 1
                 word_layer.embeddings.set_value(new_weights)
-                print 'Loaded %i pretrained embeddings.' % len(pretrained)
+                print('Loaded %i pretrained embeddings.' % len(pretrained))
                 print ('%i / %i (%.4f%%) words have been initialized with '
                        'pretrained embeddings.') % (
                             c_found + c_lower + c_zeros, n_words,
@@ -368,7 +368,7 @@ def build(self,
             lr_method_parameters = {}
 
         # Compile training function
-        print 'Compiling...'
+        print('Compiling...')
         if training:
             updates = Optimization(clip=5.0).get_updates(lr_method_name, cost, params, **lr_method_parameters)
             f_train = theano.function(

diff --git a/tagger.py b/tagger.py
@@ -8,6 +8,7 @@
 from loader import prepare_sentence
 from utils import create_input, iobes_iob, zero_digits
 from model import Model
+import pdb
 
 optparser = optparse.OptionParser()
 optparser.add_option(
@@ -34,7 +35,7 @@
 assert os.path.isfile(opts.input)
 
 # Load existing model
-print "Loading model..."
+print("Loading model...")
 model = Model(model_path=opts.model)
 parameters = model.parameters
 
@@ -51,12 +52,12 @@
 f_output = codecs.open(opts.output, 'w', 'utf-8')
 start = time.time()
 
-print 'Tagging...'
+print('Tagging...')
 with codecs.open(opts.input, 'r', 'utf-8') as f_input:
     count = 0
     for line in f_input:
         words = line.rstrip().split()
-        if line:
+        if line.strip():
             # Lowercase sentence
             if parameters['lower']:
                 line = line.lower()
@@ -84,7 +85,7 @@
             f_output.write('\n')
         count += 1
         if count % 100 == 0:
-            print count
+            print(count)
 
-print '---- %i lines tagged in %.4fs ----' % (count, time.time() - start)
+print('---- %i lines tagged in %.4fs ----' % (count, time.time() - start))
 f_output.close()
diff --git a/train.py b/train.py
@@ -133,7 +133,7 @@
 
 # Initialize model
 model = Model(parameters=parameters, models_path=models_path)
-print "Model location: %s" % model.model_path
+print("Model location: %s" % model.model_path)
 
 # Data parameters
 lower = parameters['lower']
@@ -180,19 +180,19 @@
     test_sentences, word_to_id, char_to_id, tag_to_id, lower
 )
 
-print "%i / %i / %i sentences in train / dev / test." % (
-    len(train_data), len(dev_data), len(test_data))
+print("%i / %i / %i sentences in train / dev / test." % (
+    len(train_data), len(dev_data), len(test_data)))
 
 # Save the mappings to disk
-print 'Saving the mappings to disk...'
+print('Saving the mappings to disk...')
 model.save_mappings(id_to_word, id_to_char, id_to_tag)
 
 # Build the model
 f_train, f_eval = model.build(**parameters)
 
 # Reload previous model values
 if opts.reload:
-    print 'Reloading previous model...'
+    print('Reloading previous model...')
     model.reload()
 
 #
@@ -207,27 +207,27 @@
 count = 0
-for epoch in xrange(n_epochs):
+for epoch in range(n_epochs):
     epoch_costs = []
-    print "Starting epoch %i..." % epoch
+    print("Starting epoch %i..." % epoch)
     for i, index in enumerate(np.random.permutation(len(train_data))):
         count += 1
         input = create_input(train_data[index], parameters, True, singletons)
         new_cost = f_train(*input)
         epoch_costs.append(new_cost)
-        if i % 50 == 0 and i > 0 == 0:
-            print "%i, cost average: %f" % (i, np.mean(epoch_costs[-50:]))
+        if i % 50 == 0 and i > 0:
+            print("%i, cost average: %f" % (i, np.mean(epoch_costs[-50:])))
         if count % freq_eval == 0:
             dev_score = evaluate(parameters, f_eval, dev_sentences,
                                  dev_data, id_to_tag, dico_tags)
             test_score = evaluate(parameters, f_eval, test_sentences,
                                   test_data, id_to_tag, dico_tags)
-            print "Score on dev: %.5f" % dev_score
-            print "Score on test: %.5f" % test_score
+            print("Score on dev: %.5f" % dev_score)
+            print("Score on test: %.5f" % test_score)
             if dev_score > best_dev:
                 best_dev = dev_score
-                print "New best score on dev."
-                print "Saving model to disk..."
+                print("New best score on dev.")
+                print("Saving model to disk...")
                 model.save()
             if test_score > best_test:
                 best_test = test_score
-                print "New best score on test."
-    print "Epoch %i done. Average cost: %f" % (epoch, np.mean(epoch_costs))
+                print("New best score on test.")
+    print("Epoch %i done. Average cost: %f" % (epoch, np.mean(epoch_costs)))
diff --git a/utils.py b/utils.py
@@ -3,7 +3,7 @@
 import codecs
 import numpy as np
 import theano
-
+import pdb
 
 models_path = "./models"
 eval_path = "./evaluation"
@@ -177,6 +177,8 @@ def pad_word_chars(words):
         - padded list of lists of ints (where chars are reversed)
         - list of ints corresponding to the index of the last character of each word
     """
+    print(words)
+    print('\n')
     max_length = max([len(word) for word in words])
     char_for = []
     char_rev = []
@@ -224,7 +226,6 @@ def evaluate(parameters, f_eval, raw_sentences, parsed_sentences,
     n_tags = len(id_to_tag)
     predictions = []
     count = np.zeros((n_tags, n_tags), dtype=np.int32)
-
     for raw_sentence, data in zip(raw_sentences, parsed_sentences):
         input = create_input(data, parameters, False)
         if parameters['crf']:
@@ -255,28 +256,28 @@ def evaluate(parameters, f_eval, raw_sentences, parsed_sentences,
     # CoNLL evaluation results
     eval_lines = [l.rstrip() for l in codecs.open(scores_path, 'r', 'utf8')]
     for line in eval_lines:
-        print line
+        print(line)
 
     # Remove temp files
     # os.remove(output_path)
     # os.remove(scores_path)
 
     # Confusion matrix with accuracy for each tag
-    print ("{: >2}{: >7}{: >7}%s{: >9}" % ("{: >7}" * n_tags)).format(
+    print(("{: >2}{: >7}{: >7}%s{: >9}" % ("{: >7}" * n_tags)).format(
         "ID", "NE", "Total",
-        *([id_to_tag[i] for i in xrange(n_tags)] + ["Percent"])
-    )
-    for i in xrange(n_tags):
-        print ("{: >2}{: >7}{: >7}%s{: >9}" % ("{: >7}" * n_tags)).format(
+        *([id_to_tag[i] for i in range(n_tags)] + ["Percent"])
+    ))
+    for i in range(n_tags):
+        print(("{: >2}{: >7}{: >7}%s{: >9}" % ("{: >7}" * n_tags)).format(
             str(i), id_to_tag[i], str(count[i].sum()),
-            *([count[i][j] for j in xrange(n_tags)] +
+            *([count[i][j] for j in range(n_tags)] +
               ["%.3f" % (count[i][i] * 100. / max(1, count[i].sum()))])
-        )
+        ))
 
     # Global accuracy
-    print "%i/%i (%.5f%%)" % (
+    print("%i/%i (%.5f%%)" % (
         count.trace(), count.sum(), 100. * count.trace() / max(1, count.sum())
-    )
+    ))
 
     # F1 on all entities
     return float(eval_lines[1].strip().split()[-1])