Merge pull request #681 from schen149/master

mssammon · web-flow · commit bce700cd0f70 · 2018-08-29T17:50:17.000-04:00
A temporary fix to the sentence splitter behavior in TokenizerStateMachine when dealing with acronyms; also fix the tokenizer unit tests so that they pass when a Windows-style line separator is used
diff --git a/tokenizer/src/main/java/edu/illinois/cs/cogcomp/nlp/tokenizer/Acronyms.java b/tokenizer/src/main/java/edu/illinois/cs/cogcomp/nlp/tokenizer/Acronyms.java
@@ -60,7 +60,7 @@ static ArrayList<String> get(char key) {
             "Nev", "NV", "N.H", "NH", "N.J", "NJ", "N.M", "NM", "N.Y", "NY", "N.C", "NC", "N.D",
             "ND", "MP", "OH", "Okla", "OK", "Ore", "OR", "pub", "PW", "Pa", "PA", "P.R", "PR",
             "R.I", "RI", "S.C", "SC", "S.D", "SD", "Tenn", "TN", "Tex", "TX", "UT", "Vt", "VT",
-            "Va", "VA", "V.I", "VI", "Wash", "WA", "W.Va", "WV", "Wis", "WI", "Wyo", "WY"};
+            "Va", "VA", "V.I", "VI", "Wash", "WA", "W.Va", "WV", "Wis", "WI", "Wyo", "WY", "Fr"};
 
     // init the abbr data structure.
     static {
diff --git a/tokenizer/src/main/java/edu/illinois/cs/cogcomp/nlp/tokenizer/TokenizerStateMachine.java b/tokenizer/src/main/java/edu/illinois/cs/cogcomp/nlp/tokenizer/TokenizerStateMachine.java
@@ -332,9 +332,14 @@ public void process(char token) {
                                 } else {
                                     // check for all uppercase and periods back to the start of the
                                     // word or a "-"
+                                    char nextnextChar = peek(2);
                                     if (getCurrent().isAbbr())
                                         return; // previous was upper case, acronym and word
                                                 // continues
+                                    else if (Character.isLowerCase(nextnextChar))
+                                        return; // when the next char is white space and the next next char
+                                                // is lowercase, we know that the next word is not the start of
+                                                // a sentence, so we continue.
                                     else
                                         ; // we will pass through, this is not an acronym, so must
                                           // be a special character.
diff --git a/tokenizer/src/test/java/edu/illinois/cs/cogcomp/nlp/tokenizer/StatefullTokenizerTest.java b/tokenizer/src/test/java/edu/illinois/cs/cogcomp/nlp/tokenizer/StatefullTokenizerTest.java
@@ -152,7 +152,8 @@ public void testStatefulTokenizerMultiline() {
         IntPair notOffsets = new IntPair(42, 45);
         assertEquals(notOffsets, tokenOffsets[notIndex]);
         int intolerantIndex = 14;
-        IntPair intolerantOffsets = new IntPair(77, 87);
+        int lineSepLength = System.lineSeparator().length();
+        IntPair intolerantOffsets = new IntPair(76 + lineSepLength, 86 + lineSepLength);
         assertEquals(intolerantOffsets, tokenOffsets[intolerantIndex]);
     }
     
@@ -378,7 +379,18 @@ public void testSplitOnDash() {
         Tokenizer.Tokenization tknzn = tkr.tokenizeTextSpan(text);
         assertEquals(tknzn.getTokens().length, 6);
     }
-    
+
+    /**
+     * Test sentence splitter behavior when there is a lowercase acronym followed immediately by a dot.
+     */
+    @Test
+    public void testLowerCaseAcronymEndWithDot(){
+        TokenizerTextAnnotationBuilder tab =
+                new TokenizerTextAnnotationBuilder(new StatefulTokenizer(true, true));
+        String text = "I was born in Urbana, Il. in 1992.";
+        TextAnnotation ta = tab.createTextAnnotation(text);
+        assertEquals(ta.getNumberOfSentences(), 1);
+    }
     /**
      * This can be used to just quickly debug when a sentence produces an error.
      * @param args
diff --git a/tokenizer/src/test/java/edu/illinois/cs/cogcomp/nlp/tokenizer/TokenizerTextAnnotationBuilderTest.java b/tokenizer/src/test/java/edu/illinois/cs/cogcomp/nlp/tokenizer/TokenizerTextAnnotationBuilderTest.java
@@ -32,13 +32,16 @@ public void testBuilder() {
                 "Mr. Dawkins -- a liberal professor -- doesn't like fundamentalists.   ";
         final String sentB = "He is intolerant of intolerance!";
 
-        final int refSentStartOffset = 71;
-        final int refSentEndOffset = 103;
+        String lineSep = System.lineSeparator();
+        int lineSepLength = lineSep.length();
 
-        final int refTokStartOffset = 77;
+        final int refSentStartOffset = 70 + lineSepLength;
+        final int refSentEndOffset = 102 + lineSepLength;
+
+        final int refTokStartOffset = 76 + lineSepLength;
         final int refTokEndOffset = refTokStartOffset + 10;
 
-        final String text = sentA + System.lineSeparator() + sentB;
+        final String text = sentA + lineSep + sentB;
 
         TextAnnotation ta = bldr.createTextAnnotation("test", "test", text);