Condense umlauts (currently only in German). Addresses #1184

AngledLuffa · AngledLuffa · commit 7cb54d7550e6 · 2021-10-01T14:05:18.000-07:00
diff --git a/itest/src/edu/stanford/nlp/international/german/process/GermanTokenizerPostProcessorITest.java b/itest/src/edu/stanford/nlp/international/german/process/GermanTokenizerPostProcessorITest.java
@@ -49,4 +49,20 @@ public void testPostProcessor() {
     testExample(abbreviationExample,abbreviationExampleGoldTokens);
   }
 
+  /**
+   * The input spells "fur" with an umlaut as 4 characters (u followed by the
+   * combining diaeresis U+0308); the expected token has the 3 character form.
+   * Unicode escapes keep editors from silently normalizing either spelling.
+   */
+  public void testUmlauts() {
+    String fur = "fu\u0308r";
+    assertEquals(4, fur.length());
+
+    String furry = "f\u00fcr";
+    assertEquals(3, furry.length());
+
+    String umlautExample = "Welcher der Befunde ist " + fur + " eine Gehirnerkrankung typisch?";
+    List<String> umlautGoldTokens = Arrays.asList("Welcher", "der", "Befunde", "ist", furry, "eine", "Gehirnerkrankung", "typisch", "?");
+    testExample(umlautExample, umlautGoldTokens);
+  }
 }
diff --git a/src/edu/stanford/nlp/international/german/process/GermanTokenizerPostProcessor.java b/src/edu/stanford/nlp/international/german/process/GermanTokenizerPostProcessor.java
@@ -45,6 +45,64 @@ public static void mergeTokens(CoreLabel token, CoreLabel nextToken) {
     token.setValue(token.word()+"-"+token.sentIndex());
   }
 
+  /**
+   * Condenses umlauts written as a vowel plus a combining diaeresis, which
+   * German CoreNLP doesn't handle correctly, into the one character form.
+   * Updates the token's value and word in place; a field that is null, or
+   * that contains no combining diaeresis, is left untouched.
+   */
+  public static void condenseUmlauts(CoreLabel token) {
+    final String value = token.value();
+    final String updatedValue = value == null ? null : condenseUmlauts(value);
+    if (updatedValue != null) {
+      token.setValue(updatedValue);
+    }
+
+    final String word = token.word();
+    final String updatedWord = word == null ? null : condenseUmlauts(word);
+    if (updatedWord != null) {
+      token.setWord(updatedWord);
+    }
+  }
+    
+  /**
+   * Replaces a, o, u (either case) followed by U+0308, the combining
+   * diaeresis, with the one character umlaut.  Any other U+0308 is kept.
+   *
+   * @param value the text to condense, may be null
+   * @return the rewritten text if value contains U+0308 anywhere, or null
+   *         if value is null or has no U+0308 and so needs no update
+   */
+  public static String condenseUmlauts(String value) {
+    if (value == null) {
+      return null;
+    }
+    StringBuilder ns = null;
+    for (int i = 0; i < value.length(); ++i) {
+      final char cur = value.charAt(i);
+      if (cur != '\u0308') {
+        if (ns != null) {
+          ns.append(cur);
+        }
+        continue;
+      }
+      if (ns == null) {
+        // first mark found: start the copy with everything before it
+        ns = new StringBuilder(value.length()).append(value, 0, i);
+      }
+      // a mark with no letter in front of it (e.g. at index 0) has nothing to merge with
+      final int last = ns.length() - 1;
+      final int at = last < 0 ? -1 : "aAoOuU".indexOf(ns.charAt(last));
+      if (at < 0) {
+        ns.append(cur);
+      } else {
+        // precomposed umlauts, in the same order as the base letters above
+        ns.setCharAt(last, "\u00e4\u00c4\u00f6\u00d6\u00fc\u00dc".charAt(at));
+      }
+    }
+    return ns == null ? null : ns.toString();
+  }
+
   @Override
   public List<CoreLabel> process(List<CoreLabel> tokens) {
     List<CoreLabel> processedTokens = new ArrayList<CoreLabel>();
@@ -75,6 +133,10 @@ public List<CoreLabel> process(List<CoreLabel> tokens) {
         processedTokens.add(currToken);
       }
     }
+
+    for (CoreLabel label : processedTokens) {
+      condenseUmlauts(label);
+    }
     return processedTokens;
   }