@@ -477,7 +477,7 @@ import edu.stanford.nlp.util.logging.Redwood;
   }
 
   private Object getNext() {
-    final String txt = yytext();
+    String txt = yytext();
     return getNext(txt, txt);
   }
 
@@ -589,10 +589,6 @@ SPLET = &[aeiouAEIOU](acute|grave|uml);
 
 %include LexCommon.tokens
 
-SPACENLS = {SPACENL}+
-/* These next ones are useful to get a fixed length trailing context. */
-SPACENL_ONE_CHAR = [ \t\u00A0\u2000-\u200A\u202F\u3000\r\n\u2028\u2029\u000B\u000C\u0085]
-NOT_SPACENL_ONE_CHAR = [^ \t\u00A0\u2000-\u200A\u202F\u3000\r\n\u2028\u2029\u000B\u000C\u0085]
 SENTEND1 = {SPACENL}({SPACENL}|[:uppercase:]|{SGML1})
 SENTEND2 = {SPACE}({SPACE}|[:uppercase:]|{SGML2})
 DIGIT = [:digit:]|[\u07C0-\u07C9]
@@ -672,7 +668,7 @@ SREDAUX = n{APOSETCETERA}t
 /* [yY]' is for Y'know, y'all and I for I. So exclude from one letter first */
 /* Rest are for French borrowings. n allows n'ts in "don'ts" */
 /* Arguably, c'mon should be split to "c'm" + "on", but not yet. 'Twixt for betwixt */
-APOWORD = {APOS}n{APOS}?|[lLdDjJ]{APOS}|Dunkin{APOS}|somethin{APOS}|ol{APOS}|{APOS}em|diff{APOSETCETERA}rent|[A-HJ-XZn]{APOSETCETERA}[:letter:]{2}[:letter:]*|{APOS}[1-9]0s|[1-9]0{APOS}s|{APOS}till?|[:letter:][:letter:]*[aeiouyAEIOUY]{APOSETCETERA}[aeioulA-Z][:letter:]*|{APOS}cause|cont'd\.?|nor'easter|c'mon|e'er|s'mores|ev'ry|li'l|nat'l|ass't|'twixt|O{APOSETCETERA}o
+APOWORD = {APOS}n{APOS}?|[lLdDjJ]{APOS}|(Dunkin|somethin|ol){APOS}|{APOS}em|diff{APOSETCETERA}rent|[A-HJ-XZn]{APOSETCETERA}[:letter:]{2}[:letter:]*|{APOS}[1-9]0s|[1-9]0{APOS}s|{APOS}till?|[:letter:][:letter:]*[aeiouyAEIOUY]{APOSETCETERA}[aeioulA-Z][:letter:]*|{APOS}cause|cont{APOSETCETERA}d\.?|nor{APOSETCETERA}easter|c{APOSETCETERA}mon|e{APOSETCETERA}er|s{APOSETCETERA}mores|ev{APOSETCETERA}ry|li{APOSETCETERA}l|nat{APOSETCETERA}l|ass{APOSETCETERA}t|'twixt|O{APOSETCETERA}o
 APOWORD2 = y{APOS}
 /* Some Wired URLs end in + or = so omit that too. Some quoting with '[' and ']' so disallow. */
 FULLURL = (ftp|svn|svn\+ssh|http|https|mailto):\/\/[^ \t\n\f\r<>|`\p{OpenPunctuation}\p{InitialPunctuation}\p{ClosePunctuation}\p{FinalPunctuation}]+[^ \t\n\f\r<>|.!?¡¿,·;:&`\"\'\*\p{OpenPunctuation}\p{InitialPunctuation}\p{ClosePunctuation}\p{FinalPunctuation}-]
@@ -963,13 +959,13 @@ CP1252_MISC_SYMBOL = [\u0086\u0087\u0089\u0095\u0098\u0099]
                           if (DEBUG) { logger.info("Used {TWITTER} to recognize " + tok); }
                           return getNext(tok, tok);
                         }
-{REDAUX}/[^\p{Alpha}'’]  { String tok = yytext();
+{REDAUX}/[^\p{Latin}'’]  { String tok = yytext();
                           String norm = LexerUtils.handleQuotes(tok, false, quoteStyle);
                           if (DEBUG) { logger.info("Used {REDAUX} to recognize " + tok + " as " + norm +
                                                    "; probablyLeft=" + false); }
                           return getNext(norm, tok);
                         }
-{SREDAUX}/[^\p{Alpha}'’] { String tok = yytext();
+{SREDAUX}/[^\p{Latin}'’] { String tok = yytext();
                           String norm = LexerUtils.handleQuotes(tok, false, quoteStyle);
                           if (DEBUG) { logger.info("Used {SREDAUX} to recognize " + tok + " as " + norm +
                                                    "; probablyLeft=" + false); }
@@ -1073,7 +1069,7 @@ RM/{NUM} { String txt = yytext();
                         }
 {DOLSIGN}               { String txt = yytext();
                           if (DEBUG) { logger.info("Used {DOLSIGN} to recognize " + txt); }
-                          return getNext(txt, txt);
+                          return getNext(txt, txt);
                         }
 {DOLSIGN2}              { String txt = yytext();
                           String normTok;
@@ -1100,26 +1096,49 @@ RM/{NUM} { String txt = yytext();
 <YyTokenizePerLine>{ABBREV3}/{SPACENL}?[:digit:] {
                           return processAbbrev3();
                         }
-<YyNotTokenizePerLine>{ABBREVSN}/{SPACENL}+(Africa|Korea|Cal) { return getNext(); }
-<YyTokenizePerLine>{ABBREVSN}/{SPACE}+(Africa|Korea|Cal) { return getNext(); }
+<YyNotTokenizePerLine>{ABBREVSN}/{SPACENL}+(Africa|Korea|Cal) {
+                          String txt = yytext();
+                          if (DEBUG) { logger.info("Used {N/S Place} to recognize " + txt); }
+                          return getNext(txt, txt);
+                        }
+<YyTokenizePerLine>{ABBREVSN}/{SPACE}+(Africa|Korea|Cal) {
+                          String txt = yytext();
+                          if (DEBUG) { logger.info("Used {N/S Place} (2) to recognize " + txt); }
+                          return getNext(txt, txt);
+                        }
 /* Special case to get pty. ltd. or pty limited. Also added "Co." since someone complained, but usually a comma after it. */
-(pty|pte|pvt|co)\./{SPACE}(ltd|lim|llc) { return getNext(); }
+(pty|pte|pvt|co)\./{SPACE}(ltd|lim|llc) {
+                          String txt = yytext();
+                          if (DEBUG) { logger.info("Used {pty ltd} to recognize " + txt); }
+                          return getNext(txt, txt); }
 /* Special case to get op. cit. or loc. cit. */
-(op|loc)\./{SPACE}cit\. { return getNext(); }
+(op|loc)\./{SPACE}cit\. {
+                          String txt = yytext();
+                          if (DEBUG) { logger.info("Used {op/loc cit} to recognize " + txt); }
+                          return getNext(txt, txt); }
 <YyNotTokenizePerLine>{ABBREV1}/{SENTEND1} {
                           return processAbbrev1();
                         }
 <YyTokenizePerLine>{ABBREV1}/{SENTEND2} {
                           return processAbbrev1();
                         }
-<YyNotTokenizePerLine>{ABBREV1}s?/[^][^] { return getNext(); }
-<YyTokenizePerLine>{ABBREV1}s?/[^\r\n][^\r\n] { return getNext(); }
-{ABBREV1}s? { // this one should only match if we're basically at the end of file
+<YyNotTokenizePerLine>{ABBREV1}s?/[^][^] {
+                          String txt = yytext();
+                          if (DEBUG) { logger.info("Used {ABBREV1 pl} to recognize " + txt); }
+                          return getNext(txt, txt);
+                        }
+<YyTokenizePerLine>{ABBREV1}s?/[^\r\n][^\r\n] {
+                          String txt = yytext();
+                          if (DEBUG) { logger.info("Used {ABBREV1 pl} (2) to recognize " + txt); }
+                          return getNext(txt, txt);
+                        }
+{ABBREV1}s? {
+                          // this one should only match if we're basically at the end of file
                           // since the last one matches two things, even newlines (if not tokenize per line)
                           return processAbbrev1();
                         }
 {ABBREV2}s?             { String tok = yytext();
-                          if (DEBUG) { logger.info("Used {ABBREV2} to recognize " + tok); }
+                          if (DEBUG) { logger.info("Used {ABBREV2 pl} to recognize " + tok); }
                           return getNext(tok, tok);
                         }
 /* Last millennium (in the WSJ) "Alex." is generally an abbreviation for Alex. Brown, brokers! Recognize just this case. */
@@ -1140,20 +1159,44 @@ RM/{NUM} { String txt = yytext();
                           if (DEBUG) { logger.info("Used {ABBREV4} to recognize " + tok); }
                           return getNext(tok, tok);
                         }
-{TBSPEC2}/{SPACENL}     { return getNext(); }
-{ISO8601DATETIME}       { return getNext(); }
+{TBSPEC2}/{SPACENL}     {
+                          String txt = yytext();
+                          if (DEBUG) { logger.info("Used {TBSPEC2} to recognize " + txt); }
+                          return getNext(txt, txt);
+                        }
+{ISO8601DATETIME}       {
+                          String txt = yytext();
+                          if (DEBUG) { logger.info("Used {ISO8601DATETIME} to recognize " + txt); }
+                          return getNext(txt, txt);
+                        }
 // {ISO8601DATE}        { return getNext(); }
-{DEGREES}               { return getNext(); }
+{DEGREES}               {
+                          String txt = yytext();
+                          if (DEBUG) { logger.info("Used {DEGREES} to recognize " + txt); }
+                          return getNext(txt, txt);
+                        }
 /* Ideally would factor this out for use in other tokenizers,
  * but the other tokenizers don't have TokenizerPerLine options */
-<YyNotTokenizePerLine>{FILENAME}/({SPACENL}|[.?!,\"'<()]) { return getNext(); }
-<YyTokenizePerLine>{FILENAME}/({SPACE}|[.?!,\"'<()]) { return getNext(); }
+<YyNotTokenizePerLine>{FILENAME}/({SPACENL}|[.?!,\"'<()]) {
+                          String txt = yytext();
+                          if (DEBUG) { logger.info("Used {FILENAME} to recognize " + txt); }
+                          return getNext(txt, txt);
+                        }
+<YyTokenizePerLine>{FILENAME}/({SPACE}|[.?!,\"'<()]) {
+                          String txt = yytext();
+                          if (DEBUG) { logger.info("Used {FILENAME} (2) to recognize " + txt); }
+                          return getNext(txt, txt);
+                        }
 {WORD}\./{INSENTP}      { String origTok = yytext();
                           String norm = LexerUtils.removeSoftHyphens(origTok);
                           if (DEBUG) { logger.info("Used {WORD} (3) to recognize " + origTok + " as " + norm); }
                           return getNext(norm, origTok);
                         }
-{SSN}                   { return getNext(); }
+{SSN}                   {
+                          String txt = yytext();
+                          if (DEBUG) { logger.info("Used {SSN} to recognize " + txt); }
+                          return getNext(txt, txt);
+                        }
 {PHONE}                 { String txt = yytext();
                           String norm = txt;
                           if (normalizeSpace) {
@@ -1184,48 +1227,81 @@ RM/{NUM} { String txt = yytext();
 {ASIANSMILEY}           { String txt = yytext();
                           String origText = txt;
                           txt = LexerUtils.pennNormalizeParens(txt, normalizeParentheses);
+                          if (DEBUG) { logger.info("Used {ASIANSMILEY} to recognize " + origText + " as " + txt); }
                           return getNext(txt, origText);
                         }
 {EMOJI}                 { String txt = yytext();
                           if (DEBUG) { logger.info("Used {EMOJI} to recognize " + txt); }
                           return getNext(txt, txt);
                         }
-{LESSTHAN}              { return getNext("<", yytext()); }
-{GREATERTHAN}           { return getNext(">", yytext()); }
-\{                      { if (normalizeOtherBrackets) {
-                            return getNext(openbrace, yytext()); }
+{LESSTHAN}              {
+                          String txt = yytext();
+                          if (DEBUG) { logger.info("Used {LESSTHAN} to recognize " + txt + " as <"); }
+                          return getNext("<", yytext());
+                        }
+{GREATERTHAN}           {
+                          String txt = yytext();
+                          if (DEBUG) { logger.info("Used {GREATERTHAN} to recognize " + txt + " as >"); }
+                          return getNext(">", yytext());
+                        }
+\{                      {
+                          String txt = yytext();
+                          if (normalizeOtherBrackets) {
+                            if (DEBUG) { logger.info("Used {{} to recognize " + txt + " as " + openbrace); }
+                            return getNext(openbrace, txt); }
                           else {
-                            return getNext();
+                            if (DEBUG) { logger.info("Used {{} to recognize " + txt); }
+                            return getNext(txt, txt);
                           }
                         }
-\}                      { if (normalizeOtherBrackets) {
-                            return getNext(closebrace, yytext()); }
+\}                      {
+                          String txt = yytext();
+                          if (normalizeOtherBrackets) {
+                            if (DEBUG) { logger.info("Used {}} to recognize " + txt + " as " + closebrace); }
+                            return getNext(closebrace, txt); }
                           else {
-                            return getNext();
+                            if (DEBUG) { logger.info("Used {}} to recognize " + txt); }
+                            return getNext(txt, txt);
                           }
                         }
-\[                      { if (normalizeOtherBrackets) {
-                            return getNext("-LSB-", yytext()); }
+\[                      {
+                          String txt = yytext();
+                          if (normalizeOtherBrackets) {
+                            if (DEBUG) { logger.info("Used {[} to recognize " + txt + " as " + "-LSB-"); }
+                            return getNext("-LSB-", txt); }
                           else {
-                            return getNext();
+                            if (DEBUG) { logger.info("Used {[} to recognize " + txt); }
+                            return getNext(txt, txt);
                           }
                         }
-\]                      { if (normalizeOtherBrackets) {
-                            return getNext("-RSB-", yytext()); }
+\]                      {
+                          String txt = yytext();
+                          if (normalizeOtherBrackets) {
+                            if (DEBUG) { logger.info("Used {]} to recognize " + txt + " as " + "-RSB-"); }
+                            return getNext("-RSB-", txt); }
                           else {
-                            return getNext();
+                            if (DEBUG) { logger.info("Used {]} to recognize " + txt); }
+                            return getNext(txt, txt);
                           }
                         }
-\(                      { if (normalizeParentheses) {
-                            return getNext(openparen, yytext()); }
+\(                      {
+                          String txt = yytext();
+                          if (normalizeParentheses) {
+                            if (DEBUG) { logger.info("Used {(} to recognize " + txt + " as " + openparen); }
+                            return getNext(openparen, txt); }
                           else {
-                            return getNext();
+                            if (DEBUG) { logger.info("Used {(} to recognize " + txt); }
+                            return getNext(txt, txt);
                           }
                         }
-\)                      { if (normalizeParentheses) {
-                            return getNext(closeparen, yytext()); }
+\)                      {
+                          String txt = yytext();
+                          if (normalizeParentheses) {
+                            if (DEBUG) { logger.info("Used {)} to recognize " + txt + " as " + closeparen); }
+                            return getNext(closeparen, txt); }
                           else {
-                            return getNext();
+                            if (DEBUG) { logger.info("Used {)} to recognize " + txt); }
+                            return getNext(txt, txt);
                           }
                         }
 {HYPHENS}               { final String origTxt = yytext();
@@ -1270,17 +1346,42 @@ RM/{NUM} { String txt = yytext();
                           if (DEBUG) { logger.info("Used {LDOTS5} to recognize " + tok + " as " + norm); }
                           return getNext(norm, tok);
                         }
-{FNMARKS}               { return getNext(); }
-{ASTS}                  { if (escapeForwardSlashAsterisk) {
-                            return getNext(LexerUtils.escapeChar(yytext(), '*'), yytext()); }
+{FNMARKS}               {
+                          String txt = yytext();
+                          if (DEBUG) { logger.info("Used {FNMARKS} to recognize " + txt); }
+                          return getNext(txt, txt);
+                        }
+{ASTS}                  {
+                          String txt = yytext();
+                          if (escapeForwardSlashAsterisk) {
+                            String normTok = LexerUtils.escapeChar(yytext(), '*');
+                            if (DEBUG) { logger.info("Used {ASTS} to recognize " + txt + " as " + normTok); }
+                            return getNext(normTok, yytext()); }
                           else {
-                            return getNext();
+                            if (DEBUG) { logger.info("Used {ASTS} to recognize " + txt); }
+                            return getNext(txt, txt);
                           }
                         }
-{INSENTP}               { return getNext(); }
-[?!]+|[\u2047\u2048]    { return getNext(); }
-[.¡¿\u037E\u0589\u061F\u06D4\u0700-\u0702\u07FA\u3002]  { return getNext(); }
-=+                      { return getNext(); }
+{INSENTP}               {
+                          String txt = yytext();
+                          if (DEBUG) { logger.info("Used {INSENTP} to recognize " + txt); }
+                          return getNext(txt, txt);
+                        }
+[?!]+|[\u2047\u2048]    {
+                          String txt = yytext();
+                          if (DEBUG) { logger.info("Used {[?!]+]} to recognize " + txt); }
+                          return getNext(txt, txt);
+                        }
+[.¡¿\u037E\u0589\u061F\u06D4\u0700-\u0702\u07FA\u3002]  {
+                          String txt = yytext();
+                          if (DEBUG) { logger.info("Used {sent end punct} to recognize " + txt); }
+                          return getNext(txt, txt);
+                        }
+=+                      {
+                          String txt = yytext();
+                          if (DEBUG) { logger.info("Used {=} to recognize " + txt); }
+                          return getNext(txt, txt);
+                        }
 \/                      { if (escapeForwardSlashAsterisk) {
                             return getNext(LexerUtils.escapeChar(yytext(), '/'), yytext()); }
                           else {
@@ -1392,7 +1493,11 @@ RM/{NUM} { String txt = yytext();
                         }
 
 {FAKEDUCKFEET}          { return getNext(); }
-{MISCSYMBOL}            { return getNext(); }
+{MISCSYMBOL}            {
+                          String tok = yytext();
+                          if (DEBUG) { logger.info("Used {MISCSYMBOL} to recognize " + tok); }
+                          return getNext(tok, tok);
+                        }
 {CP1252_MISC_SYMBOL}    { String tok = yytext();
                           String norm = LexerUtils.processCp1252misc(tok);
                           if (DEBUG) { logger.info("Used {CP1252_MISC_SYMBOL} to recognize " + tok + " as " + norm); }
@@ -1453,9 +1558,9 @@ RM/{NUM} { String txt = yytext();
 <<EOF>>                 { if (invertible) {
                             // prevWordAfter.append(yytext());
                             String str = prevWordAfter.toString();
-                            if (DEBUG) { logger.info("At end of text making after: |" + str + "|"); }
+                            // if (DEBUG) { logger.info("At end of text making after: |" + str + "|"); }
                             prevWord.set(CoreAnnotations.AfterAnnotation.class, str);
-                            if (DEBUG) { logger.info("prevWord is |" + prevWord.get(CoreAnnotations.TextAnnotation.class) + "|, its after is " +
+                            if (DEBUG) { logger.info("At end of text, prevWord is |" + prevWord.get(CoreAnnotations.TextAnnotation.class) + "|, its after set to " +
                                                      "|" + prevWord.get(CoreAnnotations.AfterAnnotation.class) + "|"); }
                             prevWordAfter.setLength(0);
                           }