Skip to content

Commit bce700c

Browse files
authored
Merge pull request #681 from schen149/master
A temporary fix to sentence splitter behavior in TokenizerStateMachine when dealing with acronyms; also fixing tokenizer unit tests to ensure they pass when using Windows-style line separators
2 parents 4209308 + be60f89 commit bce700c

File tree

4 files changed

+27
-7
lines changed

4 files changed

+27
-7
lines changed

tokenizer/src/main/java/edu/illinois/cs/cogcomp/nlp/tokenizer/Acronyms.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ static ArrayList<String> get(char key) {
6060
"Nev", "NV", "N.H", "NH", "N.J", "NJ", "N.M", "NM", "N.Y", "NY", "N.C", "NC", "N.D",
6161
"ND", "MP", "OH", "Okla", "OK", "Ore", "OR", "pub", "PW", "Pa", "PA", "P.R", "PR",
6262
"R.I", "RI", "S.C", "SC", "S.D", "SD", "Tenn", "TN", "Tex", "TX", "UT", "Vt", "VT",
63-
"Va", "VA", "V.I", "VI", "Wash", "WA", "W.Va", "WV", "Wis", "WI", "Wyo", "WY"};
63+
"Va", "VA", "V.I", "VI", "Wash", "WA", "W.Va", "WV", "Wis", "WI", "Wyo", "WY", "Fr"};
6464

6565
// init the abbr data structure.
6666
static {

tokenizer/src/main/java/edu/illinois/cs/cogcomp/nlp/tokenizer/TokenizerStateMachine.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -332,9 +332,14 @@ public void process(char token) {
332332
} else {
333333
// check for all uppercase and periods back to the start of the
334334
// word or a "-"
335+
char nextnextChar = peek(2);
335336
if (getCurrent().isAbbr())
336337
return; // previous was upper case, acronym and word
337338
// continues
339+
else if (Character.isLowerCase(nextnextChar))
340+
return; // when the next char is white space and the next next char
341+
// is lowercase, we know that the next word is not the start of
342+
// a sentence, so we continue.
338343
else
339344
; // we will pass through, this is not an acronym, so must
340345
// be a special character.

tokenizer/src/test/java/edu/illinois/cs/cogcomp/nlp/tokenizer/StatefullTokenizerTest.java

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,8 @@ public void testStatefulTokenizerMultiline() {
152152
IntPair notOffsets = new IntPair(42, 45);
153153
assertEquals(notOffsets, tokenOffsets[notIndex]);
154154
int intolerantIndex = 14;
155-
IntPair intolerantOffsets = new IntPair(77, 87);
155+
int lineSepLength = System.lineSeparator().length();
156+
IntPair intolerantOffsets = new IntPair(76 + lineSepLength, 86 + lineSepLength);
156157
assertEquals(intolerantOffsets, tokenOffsets[intolerantIndex]);
157158
}
158159

@@ -378,7 +379,18 @@ public void testSplitOnDash() {
378379
Tokenizer.Tokenization tknzn = tkr.tokenizeTextSpan(text);
379380
assertEquals(tknzn.getTokens().length, 6);
380381
}
381-
382+
383+
/**
384+
* Test sentence splitter behavior when there is a lower-cased acronym followed immediately by a dot.
385+
*/
386+
@Test
387+
public void testLowerCaseAcronymEndWithDot(){
388+
TokenizerTextAnnotationBuilder tab =
389+
new TokenizerTextAnnotationBuilder(new StatefulTokenizer(true, true));
390+
String text = "I was born in Urbana, Il. in 1992.";
391+
TextAnnotation ta = tab.createTextAnnotation(text);
392+
assertEquals(ta.getNumberOfSentences(), 1);
393+
}
382394
/**
383395
* This can be used to just quickly debug when a sentence produces an error.
384396
* @param args

tokenizer/src/test/java/edu/illinois/cs/cogcomp/nlp/tokenizer/TokenizerTextAnnotationBuilderTest.java

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -32,13 +32,16 @@ public void testBuilder() {
3232
"Mr. Dawkins -- a liberal professor -- doesn't like fundamentalists. ";
3333
final String sentB = "He is intolerant of intolerance!";
3434

35-
final int refSentStartOffset = 71;
36-
final int refSentEndOffset = 103;
35+
String lineSep = System.lineSeparator();
36+
int lineSepLength = lineSep.length();
3737

38-
final int refTokStartOffset = 77;
38+
final int refSentStartOffset = 70 + lineSepLength;
39+
final int refSentEndOffset = 102 + lineSepLength;
40+
41+
final int refTokStartOffset = 76 + lineSepLength;
3942
final int refTokEndOffset = refTokStartOffset + 10;
4043

41-
final String text = sentA + System.lineSeparator() + sentB;
44+
final String text = sentA + lineSep + sentB;
4245

4346
TextAnnotation ta = bldr.createTextAnnotation("test", "test", text);
4447

0 commit comments

Comments
 (0)