Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 7 additions & 6 deletions loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,9 +61,7 @@ def word_mapping(sentences, lower):
dico = create_dico(words)
dico['<UNK>'] = 10000000
word_to_id, id_to_word = create_mapping(dico)
print "Found %i unique words (%i in total)" % (
len(dico), sum(len(x) for x in words)
)
print("Found %i unique words (%i in total)" % ( len(dico), sum(len(x) for x in words) ))
return dico, word_to_id, id_to_word


Expand All @@ -74,7 +72,7 @@ def char_mapping(sentences):
chars = ["".join([w[0] for w in s]) for s in sentences]
dico = create_dico(chars)
char_to_id, id_to_char = create_mapping(dico)
print "Found %i unique characters" % len(dico)
print("Found %i unique characters" % len(dico) )
return dico, char_to_id, id_to_char


Expand All @@ -85,7 +83,7 @@ def tag_mapping(sentences):
tags = [[word[-1] for word in s] for s in sentences]
dico = create_dico(tags)
tag_to_id, id_to_tag = create_mapping(dico)
print "Found %i unique named entity tags" % len(dico)
print("Found %i unique named entity tags" % len(dico) )
return dico, tag_to_id, id_to_tag


Expand Down Expand Up @@ -160,7 +158,7 @@ def augment_with_pretrained(dictionary, ext_emb_path, words):
to the dictionary, otherwise, we only add the words that are given by
`words` (typically the words in the development and test sets.)
"""
print 'Loading pretrained embeddings from %s...' % ext_emb_path
print('Loading pretrained embeddings from %s...' % ext_emb_path)
assert os.path.isfile(ext_emb_path)

# Load pretrained embeddings from file
Expand Down Expand Up @@ -188,3 +186,6 @@ def augment_with_pretrained(dictionary, ext_emb_path, words):

word_to_id, id_to_word = create_mapping(dictionary)
return dictionary, word_to_id, id_to_word

if __name__ == "__main__":
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you mind removing this too?

print("testMain")
10 changes: 5 additions & 5 deletions model.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import theano
import theano.tensor as T
import codecs
import cPickle
import _pickle as cPickle

from utils import shared, set_values, get_name
from nn import HiddenLayer, EmbeddingLayer, DropoutLayer, LSTM, forward
Expand Down Expand Up @@ -163,7 +163,7 @@ def build(self,
# Initialize with pretrained embeddings
if pre_emb and training:
new_weights = word_layer.embeddings.get_value()
print 'Loading pretrained embeddings from %s...' % pre_emb
print('Loading pretrained embeddings from %s...' % pre_emb)
pretrained = {}
emb_invalid = 0
for i, line in enumerate(codecs.open(pre_emb, 'r', 'utf-8')):
Expand All @@ -175,7 +175,7 @@ def build(self,
else:
emb_invalid += 1
if emb_invalid > 0:
print 'WARNING: %i invalid lines' % emb_invalid
print('WARNING: %i invalid lines' % emb_invalid)
c_found = 0
c_lower = 0
c_zeros = 0
Expand All @@ -194,7 +194,7 @@ def build(self,
]
c_zeros += 1
word_layer.embeddings.set_value(new_weights)
print 'Loaded %i pretrained embeddings.' % len(pretrained)
print('Loaded %i pretrained embeddings.' % len(pretrained))
print ('%i / %i (%.4f%%) words have been initialized with '
'pretrained embeddings.') % (
c_found + c_lower + c_zeros, n_words,
Expand Down Expand Up @@ -368,7 +368,7 @@ def build(self,
lr_method_parameters = {}

# Compile training function
print 'Compiling...'
print('Compiling...')
if training:
updates = Optimization(clip=5.0).get_updates(lr_method_name, cost, params, **lr_method_parameters)
f_train = theano.function(
Expand Down
11 changes: 6 additions & 5 deletions tagger.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from loader import prepare_sentence
from utils import create_input, iobes_iob, zero_digits
from model import Model
import pdb
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you remove this?


optparser = optparse.OptionParser()
optparser.add_option(
Expand All @@ -34,7 +35,7 @@
assert os.path.isfile(opts.input)

# Load existing model
print "Loading model..."
print("Loading model...")
model = Model(model_path=opts.model)
parameters = model.parameters

Expand All @@ -51,12 +52,12 @@
f_output = codecs.open(opts.output, 'w', 'utf-8')
start = time.time()

print 'Tagging...'
print('Tagging...')
with codecs.open(opts.input, 'r', 'utf-8') as f_input:
count = 0
for line in f_input:
words = line.rstrip().split()
if line:
if len(line.strip()):
# Lowercase sentence
if parameters['lower']:
line = line.lower()
Expand Down Expand Up @@ -84,7 +85,7 @@
f_output.write('\n')
count += 1
if count % 100 == 0:
print count
print(count)

print '---- %i lines tagged in %.4fs ----' % (count, time.time() - start)
print('---- %i lines tagged in %.4fs ----' % (count, time.time() - start))
f_output.close()
26 changes: 13 additions & 13 deletions train.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@

# Initialize model
model = Model(parameters=parameters, models_path=models_path)
print "Model location: %s" % model.model_path
print("Model location: %s" % model.model_path)

# Data parameters
lower = parameters['lower']
Expand Down Expand Up @@ -180,19 +180,19 @@
test_sentences, word_to_id, char_to_id, tag_to_id, lower
)

print "%i / %i / %i sentences in train / dev / test." % (
len(train_data), len(dev_data), len(test_data))
print("%i / %i / %i sentences in train / dev / test." % (
len(train_data), len(dev_data), len(test_data)))

# Save the mappings to disk
print 'Saving the mappings to disk...'
print('Saving the mappings to disk...')
model.save_mappings(id_to_word, id_to_char, id_to_tag)

# Build the model
f_train, f_eval = model.build(**parameters)

# Reload previous model values
if opts.reload:
print 'Reloading previous model...'
print('Reloading previous model...')
model.reload()

#
Expand All @@ -207,27 +207,27 @@
count = 0
for epoch in xrange(n_epochs):
epoch_costs = []
print "Starting epoch %i..." % epoch
print("Starting epoch %i..." % epoch)
for i, index in enumerate(np.random.permutation(len(train_data))):
count += 1
input = create_input(train_data[index], parameters, True, singletons)
new_cost = f_train(*input)
epoch_costs.append(new_cost)
if i % 50 == 0 and i > 0 == 0:
print "%i, cost average: %f" % (i, np.mean(epoch_costs[-50:]))
print("%i, cost average: %f" % (i, np.mean(epoch_costs[-50:])))
if count % freq_eval == 0:
dev_score = evaluate(parameters, f_eval, dev_sentences,
dev_data, id_to_tag, dico_tags)
test_score = evaluate(parameters, f_eval, test_sentences,
test_data, id_to_tag, dico_tags)
print "Score on dev: %.5f" % dev_score
print "Score on test: %.5f" % test_score
print("Score on dev: %.5f" % dev_score)
print("Score on test: %.5f" % test_score)
if dev_score > best_dev:
best_dev = dev_score
print "New best score on dev."
print "Saving model to disk..."
print("New best score on dev.")
print("Saving model to disk...")
model.save()
if test_score > best_test:
best_test = test_score
print "New best score on test."
print "Epoch %i done. Average cost: %f" % (epoch, np.mean(epoch_costs))
print("New best score on test.")
print("Epoch %i done. Average cost: %f" % (epoch, np.mean(epoch_costs)))
19 changes: 10 additions & 9 deletions utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import codecs
import numpy as np
import theano

import pdb
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you remove this, as well as the print(words) and print('\n') below?


models_path = "./models"
eval_path = "./evaluation"
Expand Down Expand Up @@ -177,6 +177,8 @@ def pad_word_chars(words):
- padded list of lists of ints (where chars are reversed)
- list of ints corresponding to the index of the last character of each word
"""
print(words)
print('\n')
max_length = max([len(word) for word in words])
char_for = []
char_rev = []
Expand Down Expand Up @@ -224,7 +226,6 @@ def evaluate(parameters, f_eval, raw_sentences, parsed_sentences,
n_tags = len(id_to_tag)
predictions = []
count = np.zeros((n_tags, n_tags), dtype=np.int32)

for raw_sentence, data in zip(raw_sentences, parsed_sentences):
input = create_input(data, parameters, False)
if parameters['crf']:
Expand Down Expand Up @@ -255,28 +256,28 @@ def evaluate(parameters, f_eval, raw_sentences, parsed_sentences,
# CoNLL evaluation results
eval_lines = [l.rstrip() for l in codecs.open(scores_path, 'r', 'utf8')]
for line in eval_lines:
print line
print(line)

# Remove temp files
# os.remove(output_path)
# os.remove(scores_path)

# Confusion matrix with accuracy for each tag
print ("{: >2}{: >7}{: >7}%s{: >9}" % ("{: >7}" * n_tags)).format(
print( ("{: >2}{: >7}{: >7}%s{: >9}" % ("{: >7}" * n_tags)).format(
"ID", "NE", "Total",
*([id_to_tag[i] for i in xrange(n_tags)] + ["Percent"])
)
))
for i in xrange(n_tags):
print ("{: >2}{: >7}{: >7}%s{: >9}" % ("{: >7}" * n_tags)).format(
print( ("{: >2}{: >7}{: >7}%s{: >9}" % ("{: >7}" * n_tags)).format(
str(i), id_to_tag[i], str(count[i].sum()),
*([count[i][j] for j in xrange(n_tags)] +
["%.3f" % (count[i][i] * 100. / max(1, count[i].sum()))])
)
))

# Global accuracy
print "%i/%i (%.5f%%)" % (
print( "%i/%i (%.5f%%)" % (
count.trace(), count.sum(), 100. * count.trace() / max(1, count.sum())
)
))

# F1 on all entities
return float(eval_lines[1].strip().split()[-1])