Save the multiword file for Italian and then use it at runtime. An alternative would have been to save the rules directly into the model.

AngledLuffa · AngledLuffa · commit 5d33ab4dec76 · 2021-10-29T17:12:36.000-07:00
diff --git a/scripts/cdc-tokenize/Makefile b/scripts/cdc-tokenize/Makefile
@@ -9,10 +9,10 @@ HU_TEST_GOLD = /u/nlp/software/CoreNLP-models/hu/stattok/4.3.0/hu_szeged-ud-test
 
 # ignoring twittiro and postwita because this model gets thrown off
 # quite a lot by the non-standard sentence endings
-IT_TRAINING = /u/nlp/software/CoreNLP-models/it/stattok/4.3.0/it_isdt-ud-train.conllu,/u/nlp/software/CoreNLP-models/it/stattok/4.3.0/it_vit-ud-train.conllu,/u/nlp/software/CoreNLP-models/it/stattok/4.3.0/italian.mwt
+IT_TRAINING = /u/nlp/software/CoreNLP-models/it/stattok/4.3.1/it_isdt-ud-train.conllu,/u/nlp/software/CoreNLP-models/it/stattok/4.3.1/it_vit-ud-train.conllu,/u/nlp/software/CoreNLP-models/it/stattok/4.3.1/italian.mwt
 
-IT_TEST_INPUT = /u/nlp/software/CoreNLP-models/it/stattok/4.3.0/it_isdt-ud-test.txt
-IT_TEST_GOLD = /u/nlp/software/CoreNLP-models/it/stattok/4.3.0/it_isdt-ud-test.conllu
+IT_TEST_INPUT = /u/nlp/software/CoreNLP-models/it/stattok/4.3.1/it_isdt-ud-test.txt
+IT_TEST_GOLD = /u/nlp/software/CoreNLP-models/it/stattok/4.3.1/it_isdt-ud-test.conllu
 
 .SECONDEXPANSION:
 
@@ -29,8 +29,12 @@ hu-tokenizer.ser.gz:
 
 italian: it-tokenizer.ser.gz
 
-it-tokenizer.ser.gz:
+it-multiword.txt:
+	@echo Building $@
+	java edu.stanford.nlp.process.stattok.BuildMultiWordRules -trainFile $(IT_TRAINING) -multiWordRulesFile $@
+
+it-tokenizer.ser.gz: it-multiword.txt
 	@echo Training $@
-	java edu.stanford.nlp.process.stattok.StatTokSentTrainer -inferMultiWordRules 1 -trainFile $(IT_TRAINING) -serializeTo $@
-	java edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators cdc_tokenize -cdc_tokenize.model $@ -file $(IT_TEST_INPUT) -outputFormat conllu -output.printFakeDeps True -outputDirectory /tmp/$@.out
+	java edu.stanford.nlp.process.stattok.StatTokSentTrainer -trainFile $(IT_TRAINING) -multiWordRulesFile $< -serializeTo $@
+	java edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators cdc_tokenize -cdc_tokenize.model $@ -cdc_tokenize.multiWordRules $< -file $(IT_TEST_INPUT) -outputFormat conllu -output.printFakeDeps True -outputDirectory /tmp/$@.out
 	$(PYTHON) $(EVAL_SCRIPT) -v $(IT_TEST_GOLD) /tmp/$@.out/$(notdir $(IT_TEST_INPUT)).conllu
diff --git a/src/edu/stanford/nlp/pipeline/StanfordCoreNLP-italian.properties b/src/edu/stanford/nlp/pipeline/StanfordCoreNLP-italian.properties
@@ -3,7 +3,8 @@ annotators: cdc_tokenize, pos, depparse, parse, ner
 
 
 # tokenize - statistical model
-cdc_tokenize.model = edu/stanford/nlp/models/cdc-tokenize/it-tokenizer.ser.gz
+cdc_tokenize.model          = edu/stanford/nlp/models/cdc-tokenize/it-tokenizer.ser.gz
+cdc_tokenize.multiWordRules = edu/stanford/nlp/models/cdc-tokenize/it-multiword.txt
 
 # pos
 pos.model = edu/stanford/nlp/models/pos-tagger/italian.tagger
diff --git a/src/edu/stanford/nlp/process/stattok/BuildMultiWordRules.java b/src/edu/stanford/nlp/process/stattok/BuildMultiWordRules.java
@@ -0,0 +1,38 @@
+package edu.stanford.nlp.process.stattok;
+
+import java.io.IOException;
+import java.util.Map;
+import java.util.Properties;
+
+import edu.stanford.nlp.util.StringUtils;
+import edu.stanford.nlp.util.logging.Redwood;
+
+/**
+ * Builds a MultiWordRules table for the StatTokSent splitter.
+ * Command-line tool: infers the rules from -trainFile and saves them to -multiWordRulesFile.
+ */
+public class BuildMultiWordRules {
+  private static final Redwood.RedwoodChannels logger = Redwood.channels(BuildMultiWordRules.class);
+
+  // static entry point only; never instantiated
+  private BuildMultiWordRules() {}
+
+  /**
+   * Reads trainFile and multiWordRulesFile from args; logs an error and returns without writing if either is missing.
+   */
+  public static void main(String[] args) throws IOException {
+    Properties properties = StringUtils.argsToProperties(args);
+    String trainFile = properties.getProperty("trainFile");
+    String multiWordRulesFile = properties.getProperty("multiWordRulesFile");
+    if (trainFile == null) {
+      logger.err("Error: No training file provided in properties or via command line --trainFile");
+      return;
+    }
+    if (multiWordRulesFile == null) {
+      logger.err("Error: No dest file provided in properties or via command line --multiWordRulesFile");
+      return;
+    }
+    Map<String, String[]> rules = StatTokSentTrainer.inferMultiWordRules(trainFile);
+    StatTokSentTrainer.writeMultiWordRules(multiWordRulesFile, rules);
+  }
+}
diff --git a/src/edu/stanford/nlp/process/stattok/StatTokSentTrainer.java b/src/edu/stanford/nlp/process/stattok/StatTokSentTrainer.java
@@ -61,7 +61,6 @@ public StatTokSentTrainer(String[] propertiesArguments){
    * This method generates the training set for the classifier given a CoNLL-U formatted training set and set of multi-word rules (either generated from the training or written in a file).
    */
   public ArrayList<Pair<String, String>> fileToTrainSet(String trainFile, Map<String, String[]> multiWordRules)throws IOException, FileNotFoundException{
-
     ArrayList<Pair<String, String>> classChars = new ArrayList<Pair<String, String>>();
 
     for (String filename : trainFile.split("[,;]")) {
@@ -310,7 +309,7 @@ public List<String> addFeatures(ArrayList<Pair<String, String>> classCharsText,
   /**
    * Method to read multi-word token rules from a file.
    */
-  private Map<String, String[]> readMultiWordRules(String multiWordRulesFile){
+  public static Map<String, String[]> readMultiWordRules(String multiWordRulesFile) throws IOException {
     Map<String, String[]> multiWordRules = new HashMap<String, String[]>();
     // buffered and decoded from utf-8
     try (BufferedReader reader = IOUtils.readerFromString(multiWordRulesFile)) {
@@ -321,16 +320,27 @@ private Map<String, String[]> readMultiWordRules(String multiWordRulesFile){
         String[] tokenComponents = parts[1].split(",");
         multiWordRules.put(token, tokenComponents);
       }
-    } catch (Exception e) {
-      e.printStackTrace();
     }
     return multiWordRules;
   }
 
+  /**
+   * Writes the rules sorted by key, one "key TAB comma-joined components" line each, always as UTF-8 (FileWriter would use the platform default charset before Java 18, while readMultiWordRules decodes UTF-8).
+   */
+  public static void writeMultiWordRules(String multiWordRulesFile, Map<String, String[]> rules) throws IOException {
+    List<String> keys = new ArrayList<>(rules.keySet());
+    Collections.sort(keys);
+    try (BufferedWriter bw = new BufferedWriter(new java.io.OutputStreamWriter(new java.io.FileOutputStream(multiWordRulesFile), java.nio.charset.StandardCharsets.UTF_8))) {
+      for (String key : keys) {
+        bw.write(key + "\t" + String.join(",", rules.get(key)) + "\n");
+      }
+    }
+  }
+
   /**
    * Method to infer multi-word token rules directly from the training set for tokenization.
    */
-  private Map<String, String[]> inferMultiWordRules (String trainFile) throws IOException, FileNotFoundException {
+  public static Map<String, String[]> inferMultiWordRules (String trainFile) throws IOException, FileNotFoundException {
     Map<String, String[]> multiWordRules = new HashMap<String, String[]>();
     for (String filename : trainFile.split("[,;]")) {
       try (BufferedReader reader = IOUtils.readerFromString(filename)) {