Commit 89c93e1

Add fake dependency indices as an option for the tokens-only version, so that the output can be used with the CoNLL-U scoring script. Also, add MWT spans to the tokens-only CoNLL-U output.
1 parent fffad1a commit 89c93e1
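
A minimal sketch of how the new switch might be used end to end, assuming a plain token-level pipeline. The demo class name, input text, and annotator list are illustrative; the no-argument CoNLLUOutputter constructor and the fall-through to the tokens-only branch when no dependency annotations are present are assumptions, not shown in this diff.

import java.util.Properties;

import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.AnnotationOutputter;
import edu.stanford.nlp.pipeline.CoNLLUOutputter;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;

public class FakeDepsDemo {  // hypothetical demo class
  public static void main(String[] args) throws Exception {
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize,ssplit,pos,lemma");
    // New property from this commit; Options(Properties) reads it (default false).
    props.setProperty("output.printFakeDeps", "true");

    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    Annotation doc = new Annotation("This is a test.");
    pipeline.annotate(doc);

    AnnotationOutputter.Options options = new AnnotationOutputter.Options(props);
    // Assumption: with no dependency annotators run, print() takes the
    // tokens-only branch and forwards options.printFakeDeps to printPOSAnnotations.
    new CoNLLUOutputter().print(doc, System.out, options);
  }
}

The property name output.printFakeDeps and the Options(Properties) constructor come straight from the diff below.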

3 files changed: 41 additions & 8 deletions

src/edu/stanford/nlp/pipeline/AnnotationOutputter.java

Lines changed: 5 additions & 0 deletions
@@ -63,6 +63,9 @@ public static class Options {
     public final double relationsBeam;
     /** Columns to print in CoNLL output. */
     public final List<Class<? extends CoreAnnotation<?>>> keysToPrint;
+    /** Print some fake dependency info in the CoNLL output.
+        Useful for the original conll eval script, for example */
+    public final boolean printFakeDeps;


     public Options() {
@@ -80,6 +83,7 @@ public Options(boolean pretty) {
       printSingletons = false;
       relationsBeam = 0.0;
       keysToPrint = getKeysToPrint(DEFAULT_KEYS);
+      printFakeDeps = false;
     }

     public Options(Properties properties) {
@@ -94,6 +98,7 @@ public Options(Properties properties) {
       printSingletons = PropertiesUtils.getBool(properties, "output.printSingletonEntities", false);
       relationsBeam = PropertiesUtils.getDouble(properties, "output.relation.beam", 0.0);
       keysToPrint = getKeysToPrint(properties.getProperty("output.columns", DEFAULT_KEYS));
+      printFakeDeps = PropertiesUtils.getBool(properties, "output.printFakeDeps", false);
     }

     private static List<Class<? extends CoreAnnotation<?>>> getKeysToPrint(String columns) {

src/edu/stanford/nlp/pipeline/CoNLLUOutputter.java

Lines changed: 1 addition & 1 deletion
@@ -129,7 +129,7 @@ public void print(Annotation doc, OutputStream target, Options options) throws I
         throw new IllegalArgumentException("CoNLLUOutputter: unknown dependencies type " + dependenciesType);
       }
     } else {
-      writer.print(conllUWriter.printPOSAnnotations(sentence));
+      writer.print(conllUWriter.printPOSAnnotations(sentence, options.printFakeDeps));
     }
   }
   writer.flush();

src/edu/stanford/nlp/trees/ud/CoNLLUDocumentWriter.java

Lines changed: 35 additions & 7 deletions
@@ -1,6 +1,7 @@
 package edu.stanford.nlp.trees.ud;

 import edu.stanford.nlp.ling.CoreAnnotations;
+import edu.stanford.nlp.ling.AbstractCoreLabel;
 import edu.stanford.nlp.ling.CoreLabel;
 import edu.stanford.nlp.ling.IndexedWord;
 import edu.stanford.nlp.semgraph.SemanticGraph;
@@ -46,11 +47,7 @@ public String printSemanticGraph(SemanticGraph basicSg, SemanticGraph enhancedSg
     for (IndexedWord token : tokenSg.vertexListSorted()) {
       /* Check for multiword tokens. */
       if (token.containsKey(CoreAnnotations.CoNLLUTokenSpanAnnotation.class)) {
-        IntPair tokenSpan = token.get(CoreAnnotations.CoNLLUTokenSpanAnnotation.class);
-        if (tokenSpan.getSource() == token.index()) {
-          String range = String.format("%d-%d", tokenSpan.getSource(), tokenSpan.getTarget());
-          sb.append(String.format("%s\t%s\t_\t_\t_\t_\t_\t_\t_\t_%n", range, token.originalText()));
-        }
+        printSpan(sb, token);
       }

       /* Try to find main governor and additional dependencies. */
@@ -132,6 +129,17 @@ public String printSemanticGraph(SemanticGraph basicSg, SemanticGraph enhancedSg
     return sb.toString();
   }

+  /**
+   * Outputs just one token span (MWT)
+   */
+  public static void printSpan(StringBuilder sb, AbstractCoreLabel token) {
+    IntPair tokenSpan = token.get(CoreAnnotations.CoNLLUTokenSpanAnnotation.class);
+    if (tokenSpan.getSource() == token.index()) {
+      String range = String.format("%d-%d", tokenSpan.getSource(), tokenSpan.getTarget());
+      sb.append(String.format("%s\t%s\t_\t_\t_\t_\t_\t_\t_\t_%n", range, token.originalText()));
+    }
+  }
+
   /**
   * Outputs a partial CONLL-U file with token information (form, lemma, POS)
   * but without any dependency information.
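
Since printSpan is now public and static, it can be exercised in isolation. A hypothetical snippet (class name and token values invented for illustration) showing the MWT range line it emits:

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.trees.ud.CoNLLUDocumentWriter;
import edu.stanford.nlp.util.IntPair;

public class PrintSpanSketch {  // hypothetical
  public static void main(String[] args) {
    CoreLabel token = new CoreLabel();
    token.setIndex(1);                     // first word of the MWT
    token.setOriginalText("vámonos");
    token.set(CoreAnnotations.CoNLLUTokenSpanAnnotation.class, new IntPair(1, 2));

    StringBuilder sb = new StringBuilder();
    CoNLLUDocumentWriter.printSpan(sb, token);
    // Appends one range line, e.g.:
    // 1-2	vámonos	_	_	_	_	_	_	_	_
    // A token with index 2 and the same span annotation appends nothing,
    // since the range is printed only at the first word of the span.
    System.out.print(sb);
  }
}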
@@ -140,18 +148,38 @@ public String printSemanticGraph(SemanticGraph basicSg, SemanticGraph enhancedSg
   * @return
   */

-  public String printPOSAnnotations(CoreMap sentence) {
+  public String printPOSAnnotations(CoreMap sentence, boolean fakeDeps) {
     StringBuilder sb = new StringBuilder();

+    int index = 0;
     for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
+      /* Check for multiword tokens. */
+      if (token.containsKey(CoreAnnotations.CoNLLUTokenSpanAnnotation.class)) {
+        printSpan(sb, token);
+      }

       String upos = token.getString(CoreAnnotations.CoarseTagAnnotation.class, "_");
       String lemma = token.getString(CoreAnnotations.LemmaAnnotation.class, "_");
       String pos = token.getString(CoreAnnotations.PartOfSpeechAnnotation.class, "_");
       String featuresString = CoNLLUUtils.toFeatureString(token.get(CoreAnnotations.CoNLLUFeats.class));
       String misc = token.getString(CoreAnnotations.CoNLLUMisc.class, "_");
+      final String head;
+      final String rel;
+      final String headrel;
+      if (fakeDeps) {
+        // deps count from 1, with 0 as the root.
+        // we will have the first word go to fake root
+        head = Integer.toString(index);
+        rel = (index == 0) ? "root" : "dep";
+        headrel = head + ":" + rel;
+      } else {
+        head = "_";
+        rel = "_";
+        headrel = "_";
+      }
+      index++;
       sb.append(String.format("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s%n", token.index(), token.word(),
-          lemma, upos , pos, featuresString, "_", "_", "_", misc));
+          lemma, upos , pos, featuresString, head, rel, headrel, misc));
     }
     sb.append(System.lineSeparator());
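
With fakeDeps enabled, the tokens-only output forms a trivial left-to-right chain: the first word attaches to the artificial root and each later word attaches to the word before it with the generic relation dep, which is enough structure for the CoNLL-U scoring script. A hypothetical sketch for a hand-built three-token sentence (class name and words are invented; it assumes CoNLLUDocumentWriter's default constructor and that CoNLLUUtils.toFeatureString tolerates tokens with no CoNLLUFeats annotation):

import java.util.ArrayList;
import java.util.List;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.trees.ud.CoNLLUDocumentWriter;
import edu.stanford.nlp.util.ArrayCoreMap;
import edu.stanford.nlp.util.CoreMap;

public class FakeDepsOutputSketch {  // hypothetical
  public static void main(String[] args) {
    String[] words = {"This", "is", "fine"};
    List<CoreLabel> tokens = new ArrayList<>();
    for (int i = 0; i < words.length; i++) {
      CoreLabel t = new CoreLabel();
      t.setWord(words[i]);
      t.setIndex(i + 1);  // CoNLL-U token indices are 1-based
      tokens.add(t);
    }
    CoreMap sentence = new ArrayCoreMap();
    sentence.set(CoreAnnotations.TokensAnnotation.class, tokens);

    // Expected shape of the result (tab-separated; LEMMA/UPOS/XPOS/FEATS/MISC
    // stay "_" because no taggers were run here):
    // 1  This  _  _  _  _  0  root  0:root  _
    // 2  is    _  _  _  _  1  dep   1:dep   _
    // 3  fine  _  _  _  _  2  dep   2:dep   _
    System.out.print(new CoNLLUDocumentWriter().printPOSAnnotations(sentence, true));
  }
}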
