Commit 89c93e1

Add fake dependency indices as an option for the tokens-only version, so that the output can be used with the CoNLL-U scoring script. Also, add MWT spans to the tokens-only CoNLL-U output.
1 parent fffad1a commit 89c93e1
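
A minimal sketch of how the new switch might be used end to end, assuming a plain token-level pipeline. The demo class name, input text, and annotator list are illustrative; the no-argument CoNLLUOutputter constructor and the fall-through to the tokens-only branch when no dependency annotations are present are assumptions, not shown in this diff.

import java.util.Properties;

import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.AnnotationOutputter;
import edu.stanford.nlp.pipeline.CoNLLUOutputter;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;

public class FakeDepsDemo {  // hypothetical demo class
  public static void main(String[] args) throws Exception {
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize,ssplit,pos,lemma");
    // New property from this commit; Options(Properties) reads it (default false).
    props.setProperty("output.printFakeDeps", "true");

    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    Annotation doc = new Annotation("This is a test.");
    pipeline.annotate(doc);

    AnnotationOutputter.Options options = new AnnotationOutputter.Options(props);
    // Assumption: with no dependency annotators run, print() takes the
    // tokens-only branch and forwards options.printFakeDeps to printPOSAnnotations.
    new CoNLLUOutputter().print(doc, System.out, options);
  }
}

The property name output.printFakeDeps and the Options(Properties) constructor come straight from the diff below.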

3 files changed: 41 additions & 8 deletions

src/edu/stanford/nlp/pipeline/AnnotationOutputter.java

Lines changed: 5 additions & 0 deletions
@@ -63,6 +63,9 @@ public static class Options {
     public final double relationsBeam;
     /** Columns to print in CoNLL output. */
     public final List<Class<? extends CoreAnnotation<?>>> keysToPrint;
+    /** Print some fake dependency info in the CoNLL output.
+        Useful for the original conll eval script, for example */
+    public final boolean printFakeDeps;


     public Options() {
@@ -80,6 +83,7 @@ public Options(boolean pretty) {
       printSingletons = false;
       relationsBeam = 0.0;
       keysToPrint = getKeysToPrint(DEFAULT_KEYS);
+      printFakeDeps = false;
     }

     public Options(Properties properties) {
@@ -94,6 +98,7 @@ public Options(Properties properties) {
       printSingletons = PropertiesUtils.getBool(properties, "output.printSingletonEntities", false);
       relationsBeam = PropertiesUtils.getDouble(properties, "output.relation.beam", 0.0);
       keysToPrint = getKeysToPrint(properties.getProperty("output.columns", DEFAULT_KEYS));
+      printFakeDeps = PropertiesUtils.getBool(properties, "output.printFakeDeps", false);
     }

     private static List<Class<? extends CoreAnnotation<?>>> getKeysToPrint(String columns) {

src/edu/stanford/nlp/pipeline/CoNLLUOutputter.java

Lines changed: 1 addition & 1 deletion
@@ -129,7 +129,7 @@ public void print(Annotation doc, OutputStream target, Options options) throws I
         throw new IllegalArgumentException("CoNLLUOutputter: unknown dependencies type " + dependenciesType);
       }
     } else {
-      writer.print(conllUWriter.printPOSAnnotations(sentence));
+      writer.print(conllUWriter.printPOSAnnotations(sentence, options.printFakeDeps));
     }
   }
   writer.flush();

src/edu/stanford/nlp/trees/ud/CoNLLUDocumentWriter.java

Lines changed: 35 additions & 7 deletions
@@ -1,6 +1,7 @@
 package edu.stanford.nlp.trees.ud;

 import edu.stanford.nlp.ling.CoreAnnotations;
+import edu.stanford.nlp.ling.AbstractCoreLabel;
 import edu.stanford.nlp.ling.CoreLabel;
 import edu.stanford.nlp.ling.IndexedWord;
 import edu.stanford.nlp.semgraph.SemanticGraph;
@@ -46,11 +47,7 @@ public String printSemanticGraph(SemanticGraph basicSg, SemanticGraph enhancedSg
     for (IndexedWord token : tokenSg.vertexListSorted()) {
       /* Check for multiword tokens. */
       if (token.containsKey(CoreAnnotations.CoNLLUTokenSpanAnnotation.class)) {
-        IntPair tokenSpan = token.get(CoreAnnotations.CoNLLUTokenSpanAnnotation.class);
-        if (tokenSpan.getSource() == token.index()) {
-          String range = String.format("%d-%d", tokenSpan.getSource(), tokenSpan.getTarget());
-          sb.append(String.format("%s\t%s\t_\t_\t_\t_\t_\t_\t_\t_%n", range, token.originalText()));
-        }
+        printSpan(sb, token);
       }

       /* Try to find main governor and additional dependencies. */
@@ -132,6 +129,17 @@ public String printSemanticGraph(SemanticGraph basicSg, SemanticGraph enhancedSg
     return sb.toString();
   }

+  /**
+   * Outputs just one token span (MWT)
+   */
+  public static void printSpan(StringBuilder sb, AbstractCoreLabel token) {
+    IntPair tokenSpan = token.get(CoreAnnotations.CoNLLUTokenSpanAnnotation.class);
+    if (tokenSpan.getSource() == token.index()) {
+      String range = String.format("%d-%d", tokenSpan.getSource(), tokenSpan.getTarget());
+      sb.append(String.format("%s\t%s\t_\t_\t_\t_\t_\t_\t_\t_%n", range, token.originalText()));
+    }
+  }
+
   /**
   * Outputs a partial CONLL-U file with token information (form, lemma, POS)
   * but without any dependency information.
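
Since printSpan is now public and static, it can be exercised in isolation. A hypothetical snippet (class name and token values invented for illustration) showing the MWT range line it emits:

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.trees.ud.CoNLLUDocumentWriter;
import edu.stanford.nlp.util.IntPair;

public class PrintSpanSketch {  // hypothetical
  public static void main(String[] args) {
    CoreLabel token = new CoreLabel();
    token.setIndex(1);                     // first word of the MWT
    token.setOriginalText("vámonos");
    token.set(CoreAnnotations.CoNLLUTokenSpanAnnotation.class, new IntPair(1, 2));

    StringBuilder sb = new StringBuilder();
    CoNLLUDocumentWriter.printSpan(sb, token);
    // Appends one range line, e.g.:
    // 1-2	vámonos	_	_	_	_	_	_	_	_
    // A token with index 2 and the same span annotation appends nothing,
    // since the range is printed only at the first word of the span.
    System.out.print(sb);
  }
}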
@@ -140,18 +148,38 @@ public String printSemanticGraph(SemanticGraph basicSg, SemanticGraph enhancedSg
   * @return
   */

-  public String printPOSAnnotations(CoreMap sentence) {
+  public String printPOSAnnotations(CoreMap sentence, boolean fakeDeps) {
     StringBuilder sb = new StringBuilder();

+    int index = 0;
     for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
+      /* Check for multiword tokens. */
+      if (token.containsKey(CoreAnnotations.CoNLLUTokenSpanAnnotation.class)) {
+        printSpan(sb, token);
+      }

       String upos = token.getString(CoreAnnotations.CoarseTagAnnotation.class, "_");
       String lemma = token.getString(CoreAnnotations.LemmaAnnotation.class, "_");
       String pos = token.getString(CoreAnnotations.PartOfSpeechAnnotation.class, "_");
       String featuresString = CoNLLUUtils.toFeatureString(token.get(CoreAnnotations.CoNLLUFeats.class));
       String misc = token.getString(CoreAnnotations.CoNLLUMisc.class, "_");
+      final String head;
+      final String rel;
+      final String headrel;
+      if (fakeDeps) {
+        // deps count from 1, with 0 as the root.
+        // we will have the first word go to fake root
+        head = Integer.toString(index);
+        rel = (index == 0) ? "root" : "dep";
+        headrel = head + ":" + rel;
+      } else {
+        head = "_";
+        rel = "_";
+        headrel = "_";
+      }
+      index++;
       sb.append(String.format("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s%n", token.index(), token.word(),
-          lemma, upos , pos, featuresString, "_", "_", "_", misc));
+          lemma, upos , pos, featuresString, head, rel, headrel, misc));
     }
     sb.append(System.lineSeparator());
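
With fakeDeps enabled, the tokens-only output forms a trivial left-to-right chain: the first word attaches to the artificial root and each later word attaches to the word before it with the generic relation dep, which is enough structure for the CoNLL-U scoring script. A hypothetical sketch for a hand-built three-token sentence (class name and words are invented; it assumes CoNLLUDocumentWriter's default constructor and that CoNLLUUtils.toFeatureString tolerates tokens with no CoNLLUFeats annotation):

import java.util.ArrayList;
import java.util.List;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.trees.ud.CoNLLUDocumentWriter;
import edu.stanford.nlp.util.ArrayCoreMap;
import edu.stanford.nlp.util.CoreMap;

public class FakeDepsOutputSketch {  // hypothetical
  public static void main(String[] args) {
    String[] words = {"This", "is", "fine"};
    List<CoreLabel> tokens = new ArrayList<>();
    for (int i = 0; i < words.length; i++) {
      CoreLabel t = new CoreLabel();
      t.setWord(words[i]);
      t.setIndex(i + 1);  // CoNLL-U token indices are 1-based
      tokens.add(t);
    }
    CoreMap sentence = new ArrayCoreMap();
    sentence.set(CoreAnnotations.TokensAnnotation.class, tokens);

    // Expected shape of the result (tab-separated; LEMMA/UPOS/XPOS/FEATS/MISC
    // stay "_" because no taggers were run here):
    // 1  This  _  _  _  _  0  root  0:root  _
    // 2  is    _  _  _  _  1  dep   1:dep   _
    // 3  fine  _  _  _  _  2  dep   2:dep   _
    System.out.print(new CoNLLUDocumentWriter().printPOSAnnotations(sentence, true));
  }
}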
