Skip to content

Commit 5d33ab4

Browse files
committed
Save the multiword file for Italian and then use it at runtime. An alternative would have been to save it directly into the model.
1 parent a53d57c commit 5d33ab4

File tree

4 files changed

+65
-12
lines changed

4 files changed

+65
-12
lines changed

scripts/cdc-tokenize/Makefile

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,10 @@ HU_TEST_GOLD = /u/nlp/software/CoreNLP-models/hu/stattok/4.3.0/hu_szeged-ud-test
99

1010
# ignoring twittiro and postwita because this model gets thrown off
1111
# quite a lot by the non-standard sentence endings
12-
IT_TRAINING = /u/nlp/software/CoreNLP-models/it/stattok/4.3.0/it_isdt-ud-train.conllu,/u/nlp/software/CoreNLP-models/it/stattok/4.3.0/it_vit-ud-train.conllu,/u/nlp/software/CoreNLP-models/it/stattok/4.3.0/italian.mwt
12+
IT_TRAINING = /u/nlp/software/CoreNLP-models/it/stattok/4.3.1/it_isdt-ud-train.conllu,/u/nlp/software/CoreNLP-models/it/stattok/4.3.1/it_vit-ud-train.conllu,/u/nlp/software/CoreNLP-models/it/stattok/4.3.1/italian.mwt
1313

14-
IT_TEST_INPUT = /u/nlp/software/CoreNLP-models/it/stattok/4.3.0/it_isdt-ud-test.txt
15-
IT_TEST_GOLD = /u/nlp/software/CoreNLP-models/it/stattok/4.3.0/it_isdt-ud-test.conllu
14+
IT_TEST_INPUT = /u/nlp/software/CoreNLP-models/it/stattok/4.3.1/it_isdt-ud-test.txt
15+
IT_TEST_GOLD = /u/nlp/software/CoreNLP-models/it/stattok/4.3.1/it_isdt-ud-test.conllu
1616

1717
.SECONDEXPANSION:
1818

@@ -29,8 +29,12 @@ hu-tokenizer.ser.gz:
2929

3030
italian: it-tokenizer.ser.gz
3131

32-
it-tokenizer.ser.gz:
32+
# Runs BuildMultiWordRules over the Italian training files and writes the
# resulting multi-word token rules to this target; the it-tokenizer.ser.gz
# rule below lists it as a prerequisite and passes it to the trainer and to
# the pipeline.
it-multiword.txt:
33+
@echo Building $@
34+
java edu.stanford.nlp.process.stattok.BuildMultiWordRules -trainFile $(IT_TRAINING) -multiWordRulesFile $@
35+
36+
it-tokenizer.ser.gz: it-multiword.txt
3337
@echo Training $@
34-
java edu.stanford.nlp.process.stattok.StatTokSentTrainer -inferMultiWordRules 1 -trainFile $(IT_TRAINING) -serializeTo $@
35-
java edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators cdc_tokenize -cdc_tokenize.model $@ -file $(IT_TEST_INPUT) -outputFormat conllu -output.printFakeDeps True -outputDirectory /tmp/$@.out
38+
java edu.stanford.nlp.process.stattok.StatTokSentTrainer -trainFile $(IT_TRAINING) -multiWordRulesFile $< -serializeTo $@
39+
java edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators cdc_tokenize -cdc_tokenize.model $@ -cdc_tokenize.multiWordRules $< -file $(IT_TEST_INPUT) -outputFormat conllu -output.printFakeDeps True -outputDirectory /tmp/$@.out
3640
$(PYTHON) $(EVAL_SCRIPT) -v $(IT_TEST_GOLD) /tmp/$@.out/$(notdir $(IT_TEST_INPUT)).conllu

src/edu/stanford/nlp/pipeline/StanfordCoreNLP-italian.properties

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,8 @@ annotators: cdc_tokenize, pos, depparse, parse, ner
33

44

55
# tokenize - statistical model
6-
cdc_tokenize.model = edu/stanford/nlp/models/cdc-tokenize/it-tokenizer.ser.gz
6+
cdc_tokenize.model = edu/stanford/nlp/models/cdc-tokenize/it-tokenizer.ser.gz
7+
cdc_tokenize.multiWordRules = edu/stanford/nlp/models/cdc-tokenize/it-multiword.txt
78

89
# pos
910
pos.model = edu/stanford/nlp/models/pos-tagger/italian.tagger
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
package edu.stanford.nlp.process.stattok;
2+
3+
/**
4+
* Builds a MultiWordRules table for the StatTokSent splitter.
5+
*/
6+
7+
import java.io.IOException;
8+
import java.util.Map;
9+
import java.util.Properties;
10+
11+
import edu.stanford.nlp.util.StringUtils;
12+
import edu.stanford.nlp.util.logging.Redwood;
13+
14+
public class BuildMultiWordRules {
15+
private static final Redwood.RedwoodChannels logger = Redwood.channels(StatTokSentTrainer.class);
16+
17+
// disallow making this class
18+
private BuildMultiWordRules() {}
19+
20+
public static void main(String[] args) throws IOException {
21+
Properties properties = StringUtils.argsToProperties(args);
22+
String trainFile = properties.getProperty("trainFile", null);
23+
String multiWordRulesFile = properties.getProperty("multiWordRulesFile", null);
24+
25+
if (trainFile == null){
26+
logger.err("Error: No training file provided in properties or via command line --trainFile");
27+
return;
28+
}
29+
30+
if (multiWordRulesFile == null){
31+
logger.err("Error: No dest file provided in properties or via command line --multiWordRulesFile");
32+
return;
33+
}
34+
35+
Map<String, String[]> rules = StatTokSentTrainer.inferMultiWordRules(trainFile);
36+
StatTokSentTrainer.writeMultiWordRules(multiWordRulesFile, rules);
37+
}
38+
}

src/edu/stanford/nlp/process/stattok/StatTokSentTrainer.java

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,6 @@ public StatTokSentTrainer(String[] propertiesArguments){
6161
* This method generates the training set for the classifier given a CoNLL-U formatted training set and a set of multi-word rules (either inferred from the training set or read from a file).
6262
*/
6363
public ArrayList<Pair<String, String>> fileToTrainSet(String trainFile, Map<String, String[]> multiWordRules)throws IOException, FileNotFoundException{
64-
6564
ArrayList<Pair<String, String>> classChars = new ArrayList<Pair<String, String>>();
6665

6766
for (String filename : trainFile.split("[,;]")) {
@@ -310,7 +309,7 @@ public List<String> addFeatures(ArrayList<Pair<String, String>> classCharsText,
310309
/**
311310
* Method to read multi-word token rules from a file.
312311
*/
313-
private Map<String, String[]> readMultiWordRules(String multiWordRulesFile){
312+
public static Map<String, String[]> readMultiWordRules(String multiWordRulesFile) throws IOException {
314313
Map<String, String[]> multiWordRules = new HashMap<String, String[]>();
315314
// buffered and decoded from utf-8
316315
try (BufferedReader reader = IOUtils.readerFromString(multiWordRulesFile)) {
@@ -321,16 +320,27 @@ private Map<String, String[]> readMultiWordRules(String multiWordRulesFile){
321320
String[] tokenComponents = parts[1].split(",");
322321
multiWordRules.put(token, tokenComponents);
323322
}
324-
} catch (Exception e) {
325-
e.printStackTrace();
326323
}
327324
return multiWordRules;
328325
}
329326

327+
public static void writeMultiWordRules(String multiWordRulesFile, Map<String, String[]> rules) throws IOException {
328+
List<String> keys = new ArrayList<>(rules.keySet());
329+
Collections.sort(keys);
330+
try (BufferedWriter bw = new BufferedWriter(new FileWriter(multiWordRulesFile))) {
331+
for (String key : keys) {
332+
bw.write(key);
333+
bw.write("\t");
334+
bw.write(String.join(",", rules.get(key)));
335+
bw.write("\n");
336+
}
337+
}
338+
}
339+
330340
/**
331341
* Method to infer multi-word token rules directly from the training set for tokenization.
332342
*/
333-
private Map<String, String[]> inferMultiWordRules (String trainFile) throws IOException, FileNotFoundException {
343+
public static Map<String, String[]> inferMultiWordRules (String trainFile) throws IOException, FileNotFoundException {
334344
Map<String, String[]> multiWordRules = new HashMap<String, String[]>();
335345
for (String filename : trainFile.split("[,;]")) {
336346
try (BufferedReader reader = IOUtils.readerFromString(filename)) {

0 commit comments

Comments
 (0)