Add comments=true as an option to the TSV reader in MaxentTagger. When set, lines starting with '#' are skipped, which allows Universal Dependencies (CoNLL-U) files to be used directly.

AngledLuffa · AngledLuffa · commit 07a320bab1f4 · 2021-04-21T00:07:05.000-07:00
diff --git a/src/edu/stanford/nlp/tagger/io/TSVTaggedFileReader.java b/src/edu/stanford/nlp/tagger/io/TSVTaggedFileReader.java
@@ -15,6 +15,7 @@ public class TSVTaggedFileReader implements TaggedFileReader {
   private final BufferedReader reader;
   private final String filename;
   private final int wordColumn, tagColumn;
+  private final boolean usesComments;
   private List<TaggedWord> next; // = null;
   private int linesRead; // = 0;
 
@@ -34,6 +35,7 @@ public TSVTaggedFileReader(TaggedFileRecord record) {
                   DEFAULT_WORD_COLUMN : record.wordColumn);
     tagColumn = ((record.tagColumn == null) ?
                  DEFAULT_TAG_COLUMN : record.tagColumn);
+    usesComments = record.usesComments;
     primeNext();
   }
 
@@ -58,9 +60,9 @@ public List<TaggedWord> next() {
 
 
   private void primeNext() {
-    // eat all blank lines until we hit the next block of text
+    // eat all blank lines (and, if usesComments is set, lines starting with '#') until we hit the next block of text
     String line = "";
-    while (line.trim().isEmpty()) {
+    while (line.trim().isEmpty() || (usesComments && line.startsWith("#"))) {
       try {
         line = reader.readLine();
         ++linesRead;
@@ -77,14 +79,16 @@ private void primeNext() {
     // ends the sentence.
     next = new ArrayList<>();
     while (line != null && ! line.trim().isEmpty()) {
-      String[] pieces = line.split("\t");
-      if (pieces.length <= wordColumn || pieces.length <= tagColumn) {
-        throw new IllegalArgumentException("File " + filename + " line #" +
-                                           linesRead + " too short");
+      if (!(usesComments && line.startsWith("#"))) {
+        String[] pieces = line.split("\t");
+        if (pieces.length <= wordColumn || pieces.length <= tagColumn) {
+          throw new IllegalArgumentException("File " + filename + " line #" +
+                                             linesRead + " too short");
+        }
+        String word = pieces[wordColumn];
+        String tag = pieces[tagColumn];
+        next.add(new TaggedWord(word, tag));
       }
-      String word = pieces[wordColumn];
-      String tag = pieces[tagColumn];
-      next.add(new TaggedWord(word, tag));
       try {
         line = reader.readLine();
         ++linesRead;
@@ -97,4 +101,4 @@ private void primeNext() {
   @Override
   public void remove() { throw new UnsupportedOperationException(); }
 
-}
+}
diff --git a/src/edu/stanford/nlp/tagger/io/TaggedFileRecord.java b/src/edu/stanford/nlp/tagger/io/TaggedFileRecord.java
@@ -37,6 +37,7 @@ public enum Format {
   final Integer wordColumn;
   final Integer tagColumn;
   final TreeReaderFactory trf;
+  final boolean usesComments;
 
   private TaggedFileRecord(String file, Format format,
                            String encoding, String tagSeparator,
@@ -45,7 +46,8 @@ private TaggedFileRecord(String file, Format format,
                            TreeReaderFactory trf,
                            NumberRangesFileFilter treeRange,
                            Predicate<Tree> treeFilter,
-                           Integer wordColumn, Integer tagColumn) {
+                           Integer wordColumn, Integer tagColumn,
+                           boolean usesComments) {
     this.file = file;
     this.format = format;
     this.encoding = encoding;
@@ -57,6 +59,7 @@ private TaggedFileRecord(String file, Format format,
     this.wordColumn = wordColumn;
     this.tagColumn = tagColumn;
     this.trf = trf;
+    this.usesComments = usesComments;
   }
 
   public static final String FORMAT = "format";
@@ -69,6 +72,7 @@ private TaggedFileRecord(String file, Format format,
   public static final String WORD_COLUMN = "wordColumn";
   public static final String TAG_COLUMN = "tagColumn";
   public static final String TREE_READER = "trf";
+  public static final String COMMENTS = "comments";
 
   public String toString() {
     StringBuilder s = new StringBuilder();
@@ -100,6 +104,9 @@ public String toString() {
     if (tagColumn != null) {
       s.append("," + TAG_COLUMN + "=" + tagColumn);
     }
+    if (usesComments) {
+      s.append("," + COMMENTS + "=true");
+    }
     return s.toString();
   }
 
@@ -135,7 +142,7 @@ public static TaggedFileRecord createRecord(Properties config,
       return new TaggedFileRecord(description, Format.TEXT,
                                   getEncoding(config),
                                   getTagSeparator(config),
-                                  null, null, null, null, null, null, null);
+                                  null, null, null, null, null, null, null, false);
     }
 
     String[] args = new String[pieces.length - 1];
@@ -150,6 +157,7 @@ public static TaggedFileRecord createRecord(Properties config,
     NumberRangesFileFilter treeRange = null;
     Predicate<Tree> treeFilter = null;
     Integer wordColumn = null, tagColumn = null;
+    boolean comments = false;
 
     for (String arg : args) {
       String[] argPieces = arg.split("=", 2);
@@ -178,14 +186,16 @@ public static TaggedFileRecord createRecord(Properties config,
         wordColumn = Integer.valueOf(argPieces[1]);
       } else if (argPieces[0].equalsIgnoreCase(TAG_COLUMN)) {
         tagColumn = Integer.valueOf(argPieces[1]);
+      } else if (argPieces[0].equalsIgnoreCase(COMMENTS)) {
+        comments = Boolean.valueOf(argPieces[1]);
       } else {
         throw new IllegalArgumentException("TaggedFileRecord argument " +
                                            argPieces[0] + " is unknown");
       }
     }
     return new TaggedFileRecord(file, format, encoding, tagSeparator,
                                 treeTransformer, treeNormalizer, trf, treeRange,
-                                treeFilter, wordColumn, tagColumn);
+                                treeFilter, wordColumn, tagColumn, comments);
   }
 
   public static String getEncoding(Properties config) {
diff --git a/src/edu/stanford/nlp/tagger/maxent/MaxentTagger.java b/src/edu/stanford/nlp/tagger/maxent/MaxentTagger.java
@@ -168,7 +168,7 @@
        The second format is a file of Penn Treebank formatted (i.e., s-expression) tree files.  Trees are loaded one at a time and the tagged words in a tree are used as a training sentence.
        To specify this format, preface the filename with "{@code format=TREES,}".  <br>
        The final possible format is TSV files (tab-separated columns).  To specify a TSV file, set {@code trainFile} to "{@code format=TSV,wordColumn=x,tagColumn=y,filename}".
-       Column numbers are indexed from 0, and sentences are separated with blank lines. The default wordColumn is 0 and default tagColumn is 1.
+       Column numbers are indexed from 0, and sentences are separated with blank lines. The default wordColumn is 0 and default tagColumn is 1.  If {@code comments=true} is also specified, lines starting with '#' are treated as comments and skipped (such comment lines are common in CoNLL-U files).
        <br>
        A file can be in a different character set encoding than the tagger's default encoding by prefacing the filename with {@code "encoding=ENC,"}.
        You can specify the tagSeparator character in a TEXT file by prefacing the filename with "tagSeparator=c,". <br>