@@ -45,6 +45,64 @@ public static void mergeTokens(CoreLabel token, CoreLabel nextToken) {
4545 token .setValue (token .word ()+"-" +token .sentIndex ());
4646 }
4747
48+ /**
49+ * Some people write umlauts as two characters instead of just one
50+ *<br>
51+ * German CoreNLP doesn't handle the two character versions correctly,
52+ * so here we condense it into the one character version
53+ */
54+ public static void condenseUmlauts (CoreLabel token ) {
55+ String value = token .value ();
56+ String updatedValue = condenseUmlauts (value );
57+ if (updatedValue != null ) {
58+ token .setValue (updatedValue );
59+ }
60+
61+ String word = token .word ();
62+ String updatedWord = condenseUmlauts (word );
63+ if (updatedWord != null ) {
64+ token .setWord (updatedWord );
65+ }
66+ }
67+
68+ public static String condenseUmlauts (String value ) {
69+ StringBuilder ns = null ;
70+ for (int i = 0 ; i < value .length (); ++i ) {
71+ final char cur = value .charAt (i );
72+ if ((int ) cur == 776 ) {
73+ // this is the umlaut character
74+ if (ns == null ) {
75+ ns = new StringBuilder (value .length ());
76+ ns .append (value .substring (0 , i ));
77+ }
78+ final char prev = ns .charAt (ns .length () - 1 );
79+ if (prev == 'a' ) {
80+ ns .setCharAt (ns .length () - 1 , 'ä' );
81+ } else if (prev == 'A' ) {
82+ ns .setCharAt (ns .length () - 1 , 'Ä' );
83+ } else if (prev == 'o' ) {
84+ ns .setCharAt (ns .length () - 1 , 'ö' );
85+ } else if (prev == 'O' ) {
86+ ns .setCharAt (ns .length () - 1 , 'Ö' );
87+ } else if (prev == 'u' ) {
88+ ns .setCharAt (ns .length () - 1 , 'ü' );
89+ } else if (prev == 'U' ) {
90+ ns .setCharAt (ns .length () - 1 , 'Ü' );
91+ } else {
92+ ns .append (cur );
93+ }
94+ } else {
95+ if (ns != null ) {
96+ ns .append (cur );
97+ }
98+ }
99+ }
100+ if (ns != null ) {
101+ return ns .toString ();
102+ }
103+ return null ;
104+ }
105+
48106 @ Override
49107 public List <CoreLabel > process (List <CoreLabel > tokens ) {
50108 List <CoreLabel > processedTokens = new ArrayList <CoreLabel >();
@@ -75,6 +133,10 @@ public List<CoreLabel> process(List<CoreLabel> tokens) {
75133 processedTokens .add (currToken );
76134 }
77135 }
136+
137+ for (CoreLabel label : processedTokens ) {
138+ condenseUmlauts (label );
139+ }
78140 return processedTokens ;
79141 }
80142
0 commit comments