11package edu .stanford .nlp .trees .ud ;
22
33import edu .stanford .nlp .ling .CoreAnnotations ;
4+ import edu .stanford .nlp .ling .AbstractCoreLabel ;
45import edu .stanford .nlp .ling .CoreLabel ;
56import edu .stanford .nlp .ling .IndexedWord ;
67import edu .stanford .nlp .semgraph .SemanticGraph ;
@@ -46,11 +47,7 @@ public String printSemanticGraph(SemanticGraph basicSg, SemanticGraph enhancedSg
4647 for (IndexedWord token : tokenSg .vertexListSorted ()) {
4748 /* Check for multiword tokens. */
4849 if (token .containsKey (CoreAnnotations .CoNLLUTokenSpanAnnotation .class )) {
49- IntPair tokenSpan = token .get (CoreAnnotations .CoNLLUTokenSpanAnnotation .class );
50- if (tokenSpan .getSource () == token .index ()) {
51- String range = String .format ("%d-%d" , tokenSpan .getSource (), tokenSpan .getTarget ());
52- sb .append (String .format ("%s\t %s\t _\t _\t _\t _\t _\t _\t _\t _%n" , range , token .originalText ()));
53- }
50+ printSpan (sb , token );
5451 }
5552
5653 /* Try to find main governor and additional dependencies. */
@@ -132,6 +129,17 @@ public String printSemanticGraph(SemanticGraph basicSg, SemanticGraph enhancedSg
132129 return sb .toString ();
133130 }
134131
132+ /**
133+ * Outputs just one token span (MWT)
134+ */
135+ public static void printSpan (StringBuilder sb , AbstractCoreLabel token ) {
136+ IntPair tokenSpan = token .get (CoreAnnotations .CoNLLUTokenSpanAnnotation .class );
137+ if (tokenSpan .getSource () == token .index ()) {
138+ String range = String .format ("%d-%d" , tokenSpan .getSource (), tokenSpan .getTarget ());
139+ sb .append (String .format ("%s\t %s\t _\t _\t _\t _\t _\t _\t _\t _%n" , range , token .originalText ()));
140+ }
141+ }
142+
135143 /**
136144 * Outputs a partial CONLL-U file with token information (form, lemma, POS)
137145 * but without any dependency information.
@@ -140,18 +148,38 @@ public String printSemanticGraph(SemanticGraph basicSg, SemanticGraph enhancedSg
140148 * @return
141149 */
142150
143- public String printPOSAnnotations (CoreMap sentence ) {
151+ public String printPOSAnnotations (CoreMap sentence , boolean fakeDeps ) {
144152 StringBuilder sb = new StringBuilder ();
145153
154+ int index = 0 ;
146155 for (CoreLabel token : sentence .get (CoreAnnotations .TokensAnnotation .class )) {
156+ /* Check for multiword tokens. */
157+ if (token .containsKey (CoreAnnotations .CoNLLUTokenSpanAnnotation .class )) {
158+ printSpan (sb , token );
159+ }
147160
148161 String upos = token .getString (CoreAnnotations .CoarseTagAnnotation .class , "_" );
149162 String lemma = token .getString (CoreAnnotations .LemmaAnnotation .class , "_" );
150163 String pos = token .getString (CoreAnnotations .PartOfSpeechAnnotation .class , "_" );
151164 String featuresString = CoNLLUUtils .toFeatureString (token .get (CoreAnnotations .CoNLLUFeats .class ));
152165 String misc = token .getString (CoreAnnotations .CoNLLUMisc .class , "_" );
166+ final String head ;
167+ final String rel ;
168+ final String headrel ;
169+ if (fakeDeps ) {
170+ // deps count from 1, with 0 as the root.
171+ // we will have the first word go to fake root
172+ head = Integer .toString (index );
173+ rel = (index == 0 ) ? "root" : "dep" ;
174+ headrel = head + ":" + rel ;
175+ } else {
176+ head = "_" ;
177+ rel = "_" ;
178+ headrel = "_" ;
179+ }
180+ index ++;
153181 sb .append (String .format ("%s\t %s\t %s\t %s\t %s\t %s\t %s\t %s\t %s\t %s%n" , token .index (), token .word (),
154- lemma , upos , pos , featuresString , "_" , "_" , "_" , misc ));
182+ lemma , upos , pos , featuresString , head , rel , headrel , misc ));
155183 }
156184 sb .append (System .lineSeparator ());
157185
0 commit comments