Skip to content

Commit 07a320b

Browse files
committed
Add comments=true as an option to the TSV reader in MaxentTagger. Allows Universal Dependencies files to be used directly.
1 parent f1f5c91 commit 07a320b

File tree

3 files changed

+28
-14
lines changed

3 files changed

+28
-14
lines changed

src/edu/stanford/nlp/tagger/io/TSVTaggedFileReader.java

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ public class TSVTaggedFileReader implements TaggedFileReader {
1515
private final BufferedReader reader;
1616
private final String filename;
1717
private final int wordColumn, tagColumn;
18+
private final boolean usesComments;
1819
private List<TaggedWord> next; // = null;
1920
private int linesRead; // = 0;
2021

@@ -34,6 +35,7 @@ public TSVTaggedFileReader(TaggedFileRecord record) {
3435
DEFAULT_WORD_COLUMN : record.wordColumn);
3536
tagColumn = ((record.tagColumn == null) ?
3637
DEFAULT_TAG_COLUMN : record.tagColumn);
38+
usesComments = record.usesComments;
3739
primeNext();
3840
}
3941

@@ -58,9 +60,9 @@ public List<TaggedWord> next() {
5860

5961

6062
private void primeNext() {
61-
// eat all blank lines until we hit the next block of text
63+
// eat all blank lines (and maybe comments) until we hit the next block of text
6264
String line = "";
63-
while (line.trim().isEmpty()) {
65+
while (line.trim().isEmpty() || (usesComments && line.startsWith("#"))) {
6466
try {
6567
line = reader.readLine();
6668
++linesRead;
@@ -77,14 +79,16 @@ private void primeNext() {
7779
// ends the sentence.
7880
next = new ArrayList<>();
7981
while (line != null && ! line.trim().isEmpty()) {
80-
String[] pieces = line.split("\t");
81-
if (pieces.length <= wordColumn || pieces.length <= tagColumn) {
82-
throw new IllegalArgumentException("File " + filename + " line #" +
83-
linesRead + " too short");
82+
if (!(usesComments && line.startsWith("#"))) {
83+
String[] pieces = line.split("\t");
84+
if (pieces.length <= wordColumn || pieces.length <= tagColumn) {
85+
throw new IllegalArgumentException("File " + filename + " line #" +
86+
linesRead + " too short");
87+
}
88+
String word = pieces[wordColumn];
89+
String tag = pieces[tagColumn];
90+
next.add(new TaggedWord(word, tag));
8491
}
85-
String word = pieces[wordColumn];
86-
String tag = pieces[tagColumn];
87-
next.add(new TaggedWord(word, tag));
8892
try {
8993
line = reader.readLine();
9094
++linesRead;
@@ -97,4 +101,4 @@ private void primeNext() {
97101
@Override
98102
public void remove() { throw new UnsupportedOperationException(); }
99103

100-
}
104+
}

src/edu/stanford/nlp/tagger/io/TaggedFileRecord.java

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ public enum Format {
3737
final Integer wordColumn;
3838
final Integer tagColumn;
3939
final TreeReaderFactory trf;
40+
final boolean usesComments;
4041

4142
private TaggedFileRecord(String file, Format format,
4243
String encoding, String tagSeparator,
@@ -45,7 +46,8 @@ private TaggedFileRecord(String file, Format format,
4546
TreeReaderFactory trf,
4647
NumberRangesFileFilter treeRange,
4748
Predicate<Tree> treeFilter,
48-
Integer wordColumn, Integer tagColumn) {
49+
Integer wordColumn, Integer tagColumn,
50+
boolean usesComments) {
4951
this.file = file;
5052
this.format = format;
5153
this.encoding = encoding;
@@ -57,6 +59,7 @@ private TaggedFileRecord(String file, Format format,
5759
this.wordColumn = wordColumn;
5860
this.tagColumn = tagColumn;
5961
this.trf = trf;
62+
this.usesComments = usesComments;
6063
}
6164

6265
public static final String FORMAT = "format";
@@ -69,6 +72,7 @@ private TaggedFileRecord(String file, Format format,
6972
public static final String WORD_COLUMN = "wordColumn";
7073
public static final String TAG_COLUMN = "tagColumn";
7174
public static final String TREE_READER = "trf";
75+
public static final String COMMENTS = "comments";
7276

7377
public String toString() {
7478
StringBuilder s = new StringBuilder();
@@ -100,6 +104,9 @@ public String toString() {
100104
if (tagColumn != null) {
101105
s.append("," + TAG_COLUMN + "=" + tagColumn);
102106
}
107+
if (usesComments) {
108+
s.append("," + COMMENTS + "=true");
109+
}
103110
return s.toString();
104111
}
105112

@@ -135,7 +142,7 @@ public static TaggedFileRecord createRecord(Properties config,
135142
return new TaggedFileRecord(description, Format.TEXT,
136143
getEncoding(config),
137144
getTagSeparator(config),
138-
null, null, null, null, null, null, null);
145+
null, null, null, null, null, null, null, false);
139146
}
140147

141148
String[] args = new String[pieces.length - 1];
@@ -150,6 +157,7 @@ public static TaggedFileRecord createRecord(Properties config,
150157
NumberRangesFileFilter treeRange = null;
151158
Predicate<Tree> treeFilter = null;
152159
Integer wordColumn = null, tagColumn = null;
160+
boolean comments = false;
153161

154162
for (String arg : args) {
155163
String[] argPieces = arg.split("=", 2);
@@ -178,14 +186,16 @@ public static TaggedFileRecord createRecord(Properties config,
178186
wordColumn = Integer.valueOf(argPieces[1]);
179187
} else if (argPieces[0].equalsIgnoreCase(TAG_COLUMN)) {
180188
tagColumn = Integer.valueOf(argPieces[1]);
189+
} else if (argPieces[0].equalsIgnoreCase(COMMENTS)) {
190+
comments = Boolean.valueOf(argPieces[1]);
181191
} else {
182192
throw new IllegalArgumentException("TaggedFileRecord argument " +
183193
argPieces[0] + " is unknown");
184194
}
185195
}
186196
return new TaggedFileRecord(file, format, encoding, tagSeparator,
187197
treeTransformer, treeNormalizer, trf, treeRange,
188-
treeFilter, wordColumn, tagColumn);
198+
treeFilter, wordColumn, tagColumn, comments);
189199
}
190200

191201
public static String getEncoding(Properties config) {

src/edu/stanford/nlp/tagger/maxent/MaxentTagger.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -168,7 +168,7 @@
168168
The second format is a file of Penn Treebank formatted (i.e., s-expression) tree files. Trees are loaded one at a time and the tagged words in a tree are used as a training sentence.
169169
To specify this format, preface the filename with "{@code format=TREES,}". <br>
170170
The final possible format is TSV files (tab-separated columns). To specify a TSV file, set {@code trainFile} to "{@code format=TSV,wordColumn=x,tagColumn=y,filename}".
171-
Column numbers are indexed from 0, and sentences are separated with blank lines. The default wordColumn is 0 and default tagColumn is 1.
171+
Column numbers are indexed from 0, and sentences are separated with blank lines. The default wordColumn is 0 and default tagColumn is 1. If {@code comments=true}, lines starting with "#" are treated as comments and skipped (such comment lines are common in CoNLL-U files).
172172
<br>
173173
A file can be in a different character set encoding than the tagger's default encoding by prefacing the filename with {@code "encoding=ENC,"}.
174174
You can specify the tagSeparator character in a TEXT file by prefacing the filename with "tagSeparator=c,". <br>

0 commit comments

Comments
 (0)