Skip to content

Commit 7cb54d7

Browse files
committed
Condense umlauts (currently only in German). Addresses #1184
1 parent dba0752 commit 7cb54d7

File tree

2 files changed

+78
-0
lines changed

2 files changed

+78
-0
lines changed

itest/src/edu/stanford/nlp/international/german/process/GermanTokenizerPostProcessorITest.java

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,4 +49,20 @@ public void testPostProcessor() {
4949
testExample(abbreviationExample,abbreviationExampleGoldTokens);
5050
}
5151

52+
/**
53+
* You probably can't tell in your editor, but the input has 4 characters for
54+
* <pre>für</pre>
55+
* and the output has 3
56+
*/
57+
public void testUmlauts() {
58+
String fur = "für";
59+
assertEquals(4, fur.length());
60+
61+
String furry = "für";
62+
assertEquals(3, furry.length());
63+
64+
String umlautExample = "Welcher der Befunde ist " + fur + " eine Gehirnerkrankung typisch?";
65+
List<String> umlautGoldTokens = Arrays.asList("Welcher", "der", "Befunde", "ist", furry, "eine", "Gehirnerkrankung", "typisch", "?");
66+
testExample(umlautExample, umlautGoldTokens);
67+
}
5268
}

src/edu/stanford/nlp/international/german/process/GermanTokenizerPostProcessor.java

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,64 @@ public static void mergeTokens(CoreLabel token, CoreLabel nextToken) {
4545
token.setValue(token.word()+"-"+token.sentIndex());
4646
}
4747

48+
/**
49+
* Some people write umlauts as two characters instead of just one
50+
*<br>
51+
* German CoreNLP doesn't handle the two character versions correctly,
52+
* so here we condense it into the one character version
53+
*/
54+
public static void condenseUmlauts(CoreLabel token) {
55+
String value = token.value();
56+
String updatedValue = condenseUmlauts(value);
57+
if (updatedValue != null) {
58+
token.setValue(updatedValue);
59+
}
60+
61+
String word = token.word();
62+
String updatedWord = condenseUmlauts(word);
63+
if (updatedWord != null) {
64+
token.setWord(updatedWord);
65+
}
66+
}
67+
68+
public static String condenseUmlauts(String value) {
69+
StringBuilder ns = null;
70+
for (int i = 0; i < value.length(); ++i) {
71+
final char cur = value.charAt(i);
72+
if ((int) cur == 776) {
73+
// this is the umlaut character
74+
if (ns == null) {
75+
ns = new StringBuilder(value.length());
76+
ns.append(value.substring(0, i));
77+
}
78+
final char prev = ns.charAt(ns.length() - 1);
79+
if (prev == 'a') {
80+
ns.setCharAt(ns.length() - 1, 'ä');
81+
} else if (prev == 'A') {
82+
ns.setCharAt(ns.length() - 1, 'Ä');
83+
} else if (prev == 'o') {
84+
ns.setCharAt(ns.length() - 1, 'ö');
85+
} else if (prev == 'O') {
86+
ns.setCharAt(ns.length() - 1, 'Ö');
87+
} else if (prev == 'u') {
88+
ns.setCharAt(ns.length() - 1, 'ü');
89+
} else if (prev == 'U') {
90+
ns.setCharAt(ns.length() - 1, 'Ü');
91+
} else {
92+
ns.append(cur);
93+
}
94+
} else {
95+
if (ns != null) {
96+
ns.append(cur);
97+
}
98+
}
99+
}
100+
if (ns != null) {
101+
return ns.toString();
102+
}
103+
return null;
104+
}
105+
48106
@Override
49107
public List<CoreLabel> process(List<CoreLabel> tokens) {
50108
List<CoreLabel> processedTokens = new ArrayList<CoreLabel>();
@@ -75,6 +133,10 @@ public List<CoreLabel> process(List<CoreLabel> tokens) {
75133
processedTokens.add(currToken);
76134
}
77135
}
136+
137+
for (CoreLabel label : processedTokens) {
138+
condenseUmlauts(label);
139+
}
78140
return processedTokens;
79141
}
80142

0 commit comments

Comments
 (0)