Skip to content

Commit 782459c

Browse files
authored
Improve possessive repeat support (#632)
1 parent 09c07ac commit 782459c

File tree

6 files changed

+213
-111
lines changed

6 files changed

+213
-111
lines changed

src/pcre2_jit_char_inc.h

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2007,7 +2007,7 @@ switch(type)
20072007
if (common->mode == PCRE2_JIT_PARTIAL_HARD)
20082008
{
20092009
jump[0] = CMP(SLJIT_LESS, SLJIT_RETURN_REG, 0, STR_END, 0);
2010-
/* Since we successfully read a char above, partial matching must occure. */
2010+
/* Since we successfully read a char above, partial matching must occur. */
20112011
check_partial(common, TRUE);
20122012
JUMPHERE(jump[0]);
20132013
}

src/pcre2_jit_compile.c

Lines changed: 147 additions & 110 deletions
Original file line number | Diff line number | Diff line change
@@ -1184,13 +1184,8 @@ while (cc < ccend)
11841184

11851185
case OP_TYPEPOSUPTO:
11861186
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1187-
if (common->utf)
1188-
{
1189-
if (cc[1 + IMM2_SIZE] == OP_EXTUNI && locals_size <= 4 * SSIZE_OF(sw))
1190-
locals_size = 4 * SSIZE_OF(sw);
1191-
else if (locals_size <= 3 * SSIZE_OF(sw))
1192-
locals_size = 3 * SSIZE_OF(sw);
1193-
}
1187+
if (common->utf && locals_size <= 3 * SSIZE_OF(sw))
1188+
locals_size = 3 * SSIZE_OF(sw);
11941189
#endif
11951190
if (cc[1 + IMM2_SIZE] == OP_EXTUNI && locals_size <= 3 * SSIZE_OF(sw))
11961191
locals_size = 3 * SSIZE_OF(sw);
@@ -1307,7 +1302,8 @@ while (cc < ccend)
13071302

13081303
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
13091304
case OP_CRPOSRANGE:
1310-
if (GET2(cc, 1) < GET2(cc, 1 + IMM2_SIZE) && locals_size <= 3 * SSIZE_OF(sw))
1305+
/* The second value can be 0 for infinite repeats. */
1306+
if (common->utf && GET2(cc, 1) != GET2(cc, 1 + IMM2_SIZE) && locals_size <= 3 * SSIZE_OF(sw))
13111307
locals_size = 3 * SSIZE_OF(sw);
13121308
cc += 1 + 2 * IMM2_SIZE;
13131309
break;
@@ -10447,8 +10443,10 @@ else
1044710443
*exact = 1;
1044810444
*opcode -= OP_PLUS - OP_STAR;
1044910445
}
10446+
return cc;
1045010447
}
10451-
else if (*opcode >= OP_CRPOSSTAR && *opcode <= OP_CRPOSQUERY)
10448+
10449+
if (*opcode >= OP_CRPOSSTAR && *opcode <= OP_CRPOSQUERY)
1045210450
{
1045310451
*opcode -= OP_CRPOSSTAR - OP_POSSTAR;
1045410452
*end = cc + class_len;
@@ -10458,41 +10456,36 @@ else
1045810456
*exact = 1;
1045910457
*opcode = OP_POSSTAR;
1046010458
}
10459+
return cc;
1046110460
}
10462-
else
10461+
10462+
SLJIT_ASSERT(*opcode == OP_CRRANGE || *opcode == OP_CRMINRANGE || *opcode == OP_CRPOSRANGE);
10463+
*max = GET2(cc, (class_len + IMM2_SIZE));
10464+
*exact = GET2(cc, class_len);
10465+
*end = cc + class_len + 2 * IMM2_SIZE;
10466+
10467+
if (*max == 0)
1046310468
{
10464-
SLJIT_ASSERT(*opcode == OP_CRRANGE || *opcode == OP_CRMINRANGE || *opcode == OP_CRPOSRANGE);
10465-
*max = GET2(cc, (class_len + IMM2_SIZE));
10466-
*exact = GET2(cc, class_len);
10469+
SLJIT_ASSERT(*exact > 1);
10470+
if (*opcode == OP_CRPOSRANGE)
10471+
*opcode = OP_POSUPTO;
10472+
else
10473+
*opcode -= OP_CRRANGE - OP_STAR;
10474+
return cc;
10475+
}
1046710476

10468-
if (*max == 0)
10469-
{
10470-
if (*opcode == OP_CRPOSRANGE)
10471-
*opcode = OP_POSSTAR;
10472-
else
10473-
*opcode -= OP_CRRANGE - OP_STAR;
10474-
}
10477+
*max -= *exact;
10478+
if (*max == 0)
10479+
*opcode = OP_EXACT;
10480+
else
10481+
{
10482+
SLJIT_ASSERT(*exact > 0 || *max > 1);
10483+
if (*opcode == OP_CRPOSRANGE)
10484+
*opcode = OP_POSUPTO;
10485+
else if (*max == 1)
10486+
*opcode -= OP_CRRANGE - OP_QUERY;
1047510487
else
10476-
{
10477-
*max -= *exact;
10478-
if (*max == 0)
10479-
*opcode = OP_EXACT;
10480-
else if (*max == 1)
10481-
{
10482-
if (*opcode == OP_CRPOSRANGE)
10483-
*opcode = OP_POSQUERY;
10484-
else
10485-
*opcode -= OP_CRRANGE - OP_QUERY;
10486-
}
10487-
else
10488-
{
10489-
if (*opcode == OP_CRPOSRANGE)
10490-
*opcode = OP_POSUPTO;
10491-
else
10492-
*opcode -= OP_CRRANGE - OP_UPTO;
10493-
}
10494-
}
10495-
*end = cc + class_len + 2 * IMM2_SIZE;
10488+
*opcode -= OP_CRRANGE - OP_UPTO;
1049610489
}
1049710490
return cc;
1049810491
}
@@ -10593,36 +10586,49 @@ else
1059310586
}
1059410587

1059510588
/* Handle fixed part first. */
10596-
if (exact > 1)
10589+
if (opcode != OP_POSUPTO)
1059710590
{
10598-
SLJIT_ASSERT(early_fail_ptr == 0);
10591+
if (exact > 1)
10592+
{
10593+
SLJIT_ASSERT(early_fail_ptr == 0);
1059910594

10600-
if (common->mode == PCRE2_JIT_COMPLETE
10601-
#ifdef SUPPORT_UNICODE
10602-
&& !common->utf
10595+
if (common->mode == PCRE2_JIT_COMPLETE
10596+
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
10597+
&& !common->utf
1060310598
#endif
10604-
&& type != OP_ANYNL && type != OP_EXTUNI)
10605-
{
10606-
OP2(SLJIT_ADD, TMP1, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(exact));
10607-
add_jump(compiler, &backtrack->own_backtracks, CMP(SLJIT_GREATER, TMP1, 0, STR_END, 0));
10608-
OP1(SLJIT_MOV, tmp_base, tmp_offset, SLJIT_IMM, exact);
10609-
label = LABEL();
10610-
compile_char1_matchingpath(common, type, cc, &backtrack->own_backtracks, FALSE);
10611-
OP2(SLJIT_SUB | SLJIT_SET_Z, tmp_base, tmp_offset, tmp_base, tmp_offset, SLJIT_IMM, 1);
10612-
JUMPTO(SLJIT_NOT_ZERO, label);
10599+
&& type != OP_ANYNL && type != OP_EXTUNI)
10600+
{
10601+
OP2(SLJIT_SUB, TMP1, 0, STR_END, 0, STR_PTR, 0);
10602+
add_jump(compiler, &backtrack->own_backtracks, CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, IN_UCHARS(exact)));
10603+
10604+
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 32
10605+
if (type == OP_ALLANY && !common->invalid_utf)
10606+
#else
10607+
if (type == OP_ALLANY)
10608+
#endif
10609+
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(exact));
10610+
else
10611+
{
10612+
OP1(SLJIT_MOV, tmp_base, tmp_offset, SLJIT_IMM, exact);
10613+
label = LABEL();
10614+
compile_char1_matchingpath(common, type, cc, &backtrack->own_backtracks, FALSE);
10615+
OP2(SLJIT_SUB | SLJIT_SET_Z, tmp_base, tmp_offset, tmp_base, tmp_offset, SLJIT_IMM, 1);
10616+
JUMPTO(SLJIT_NOT_ZERO, label);
10617+
}
10618+
}
10619+
else
10620+
{
10621+
SLJIT_ASSERT(tmp_base == TMP3 || common->locals_size >= 3 * SSIZE_OF(sw));
10622+
OP1(SLJIT_MOV, tmp_base, tmp_offset, SLJIT_IMM, exact);
10623+
label = LABEL();
10624+
compile_char1_matchingpath(common, type, cc, &backtrack->own_backtracks, TRUE);
10625+
OP2(SLJIT_SUB | SLJIT_SET_Z, tmp_base, tmp_offset, tmp_base, tmp_offset, SLJIT_IMM, 1);
10626+
JUMPTO(SLJIT_NOT_ZERO, label);
10627+
}
1061310628
}
10614-
else
10615-
{
10616-
SLJIT_ASSERT(tmp_base == TMP3 || common->locals_size >= 3 * SSIZE_OF(sw));
10617-
OP1(SLJIT_MOV, tmp_base, tmp_offset, SLJIT_IMM, exact);
10618-
label = LABEL();
10629+
else if (exact == 1 && opcode != OP_STAR && opcode != OP_MINSTAR && opcode != OP_POSSTAR)
1061910630
compile_char1_matchingpath(common, type, cc, &backtrack->own_backtracks, TRUE);
10620-
OP2(SLJIT_SUB | SLJIT_SET_Z, tmp_base, tmp_offset, tmp_base, tmp_offset, SLJIT_IMM, 1);
10621-
JUMPTO(SLJIT_NOT_ZERO, label);
10622-
}
1062310631
}
10624-
else if (exact == 1 && opcode != OP_STAR && opcode != OP_MINSTAR && opcode != OP_POSSTAR)
10625-
compile_char1_matchingpath(common, type, cc, &backtrack->own_backtracks, TRUE);
1062610632

1062710633
if (early_fail_type == type_fail_range)
1062810634
{
@@ -10987,29 +10993,34 @@ switch(opcode)
1098710993
}
1098810994

1098910995
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
10990-
if (type == OP_EXTUNI || common->utf)
10996+
if (common->utf)
1099110997
{
1099210998
SLJIT_ASSERT(tmp_base == TMP3 || common->locals_size >= 3 * SSIZE_OF(sw));
1099310999

10994-
OP1(SLJIT_MOV, tmp_base, tmp_offset, exact == 1 ? SLJIT_IMM : STR_PTR, 0);
11000+
if (tmp_base != TMP3)
11001+
{
11002+
OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCAL2, COUNT_MATCH, 0);
11003+
tmp_base = COUNT_MATCH;
11004+
}
11005+
11006+
OP1(SLJIT_MOV, tmp_base, 0, exact == 1 ? SLJIT_IMM : STR_PTR, 0);
1099511007
detect_partial_match(common, &no_match);
1099611008
label = LABEL();
1099711009
compile_char1_matchingpath(common, type, cc, &no_match, FALSE);
10998-
OP1(SLJIT_MOV, tmp_base, tmp_offset, STR_PTR, 0);
11010+
OP1(SLJIT_MOV, tmp_base, 0, STR_PTR, 0);
1099911011
detect_partial_match_to(common, label);
1100011012

1100111013
set_jumps(no_match, LABEL());
11002-
OP1(SLJIT_MOV, STR_PTR, 0, tmp_base, tmp_offset);
11014+
OP1(SLJIT_MOV, STR_PTR, 0, tmp_base, 0);
11015+
11016+
if (tmp_base != TMP3)
11017+
OP1(SLJIT_MOV, COUNT_MATCH, 0, SLJIT_MEM1(SLJIT_SP), LOCAL2);
11018+
1100311019
if (exact == 1)
1100411020
add_jump(compiler, &backtrack->own_backtracks, CMP(SLJIT_EQUAL, STR_PTR, 0, SLJIT_IMM, 0));
1100511021

1100611022
if (early_fail_ptr != 0)
11007-
{
11008-
if (!HAS_VIRTUAL_REGISTERS && tmp_base == TMP3)
11009-
OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), early_fail_ptr, TMP3, 0);
11010-
else
11011-
OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), early_fail_ptr, STR_PTR, 0);
11012-
}
11023+
OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), early_fail_ptr, STR_PTR, 0);
1101311024
break;
1101411025
}
1101511026
#endif
@@ -11019,12 +11030,17 @@ switch(opcode)
1101911030

1102011031
detect_partial_match(common, &no_match);
1102111032
label = LABEL();
11033+
/* Extuni never fails, so no_char1_match is not used in that case.
11034+
Anynl optionally reads an extra character on success. */
1102211035
compile_char1_matchingpath(common, type, cc, &no_char1_match, FALSE);
1102311036
detect_partial_match_to(common, label);
11024-
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
11037+
if (type != OP_EXTUNI)
11038+
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
1102511039

1102611040
set_jumps(no_char1_match, LABEL());
11027-
OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
11041+
if (type != OP_EXTUNI)
11042+
OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
11043+
1102811044
set_jumps(no_match, LABEL());
1102911045

1103011046
if (exact == 1)
@@ -11036,65 +11052,86 @@ switch(opcode)
1103611052

1103711053
case OP_POSUPTO:
1103811054
SLJIT_ASSERT(early_fail_ptr == 0);
11039-
SLJIT_ASSERT(tmp_base == TMP3 || common->locals_size >= 3 * SSIZE_OF(sw));
11055+
11056+
max += exact;
1104011057
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
11041-
if (common->utf)
11058+
if (type == OP_EXTUNI || common->utf)
11059+
#else
11060+
if (type == OP_EXTUNI)
11061+
#endif
1104211062
{
1104311063
SLJIT_ASSERT(common->locals_size >= 3 * SSIZE_OF(sw));
11044-
if (tmp_base != TMP3)
11045-
{
11046-
SLJIT_ASSERT(type == OP_EXTUNI && common->locals_size >= 4 * SSIZE_OF(sw));
11047-
tmp_offset = LOCAL3;
11048-
}
1104911064

11050-
OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCAL2, STR_PTR, 0);
11051-
OP1(SLJIT_MOV, tmp_base, tmp_offset, SLJIT_IMM, max);
11065+
/* Count match is not modified by compile_char1_matchingpath. */
11066+
OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCAL2, COUNT_MATCH, 0);
11067+
OP1(SLJIT_MOV, COUNT_MATCH, 0, SLJIT_IMM, exact == max ? 0 : max);
1105211068

11053-
detect_partial_match(common, &no_match);
1105411069
label = LABEL();
11055-
compile_char1_matchingpath(common, type, cc, &no_match, FALSE);
11056-
OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCAL2, STR_PTR, 0);
11057-
OP2(SLJIT_SUB | SLJIT_SET_Z, tmp_base, tmp_offset, tmp_base, tmp_offset, SLJIT_IMM, 1);
11058-
add_jump(compiler, &no_match, JUMP(SLJIT_ZERO));
11059-
detect_partial_match_to(common, label);
11060-
11061-
set_jumps(no_match, LABEL());
11062-
OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_MEM1(SLJIT_SP), LOCAL2);
11063-
break;
11064-
}
11065-
#endif
11066-
11067-
if (type == OP_ALLANY)
11068-
{
11069-
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(max));
11070+
/* Extuni only modifies TMP3 on successful match. */
11071+
OP1(SLJIT_MOV, TMP3, 0, STR_PTR, 0);
11072+
compile_char1_matchingpath(common, type, cc, &no_match, TRUE);
1107011073

11071-
if (common->mode == PCRE2_JIT_COMPLETE)
11074+
if (exact == max)
1107211075
{
11073-
OP2U(SLJIT_SUB | SLJIT_SET_GREATER, STR_PTR, 0, STR_END, 0);
11074-
SELECT(SLJIT_GREATER, STR_PTR, STR_END, 0, STR_PTR);
11076+
OP2(SLJIT_ADD, COUNT_MATCH, 0, COUNT_MATCH, 0, SLJIT_IMM, 1);
11077+
JUMPTO(SLJIT_JUMP, label);
1107511078
}
1107611079
else
1107711080
{
11078-
jump = CMP(SLJIT_LESS_EQUAL, STR_PTR, 0, STR_END, 0);
11079-
process_partial_match(common);
11080-
JUMPHERE(jump);
11081+
OP2(SLJIT_SUB | SLJIT_SET_Z, COUNT_MATCH, 0, COUNT_MATCH, 0, SLJIT_IMM, 1);
11082+
JUMPTO(SLJIT_NOT_ZERO, label);
11083+
OP1(SLJIT_MOV, TMP3, 0, STR_PTR, 0);
11084+
}
11085+
11086+
set_jumps(no_match, LABEL());
11087+
11088+
if (exact > 0)
11089+
{
11090+
if (exact == max)
11091+
OP2U(SLJIT_SUB | SLJIT_SET_LESS, COUNT_MATCH, 0, SLJIT_IMM, exact);
11092+
else
11093+
OP2U(SLJIT_SUB | SLJIT_SET_GREATER, COUNT_MATCH, 0, SLJIT_IMM, max - exact);
1108111094
}
11095+
11096+
OP1(SLJIT_MOV, COUNT_MATCH, 0, SLJIT_MEM1(SLJIT_SP), LOCAL2);
11097+
11098+
if (exact > 0)
11099+
add_jump(compiler, &backtrack->own_backtracks, JUMP(exact == max ? SLJIT_LESS : SLJIT_GREATER));
11100+
OP1(SLJIT_MOV, STR_PTR, 0, TMP3, 0);
1108211101
break;
1108311102
}
1108411103

11085-
OP1(SLJIT_MOV, tmp_base, tmp_offset, SLJIT_IMM, max);
11104+
SLJIT_ASSERT(tmp_base == TMP3);
11105+
11106+
OP1(SLJIT_MOV, TMP3, 0, SLJIT_IMM, exact == max ? 0 : max);
1108611107

1108711108
detect_partial_match(common, &no_match);
1108811109
label = LABEL();
1108911110
compile_char1_matchingpath(common, type, cc, &no_char1_match, FALSE);
11090-
OP2(SLJIT_SUB | SLJIT_SET_Z, tmp_base, tmp_offset, tmp_base, tmp_offset, SLJIT_IMM, 1);
11091-
add_jump(compiler, &no_match, JUMP(SLJIT_ZERO));
11111+
11112+
if (exact == max)
11113+
OP2(SLJIT_ADD, TMP3, 0, TMP3, 0, SLJIT_IMM, 1);
11114+
else
11115+
{
11116+
OP2(SLJIT_SUB | SLJIT_SET_Z, TMP3, 0, TMP3, 0, SLJIT_IMM, 1);
11117+
add_jump(compiler, &no_match, JUMP(SLJIT_ZERO));
11118+
}
1109211119
detect_partial_match_to(common, label);
1109311120
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
1109411121

1109511122
set_jumps(no_char1_match, LABEL());
1109611123
OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
1109711124
set_jumps(no_match, LABEL());
11125+
11126+
if (exact > 0)
11127+
{
11128+
if (exact == max)
11129+
jump = CMP(SLJIT_LESS, TMP3, 0, SLJIT_IMM, exact);
11130+
else
11131+
jump = CMP(SLJIT_GREATER, TMP3, 0, SLJIT_IMM, max - exact);
11132+
11133+
add_jump(compiler, &backtrack->own_backtracks, jump);
11134+
}
1109811135
break;
1109911136

1110011137
case OP_POSQUERY:

testdata/testinput1

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -7034,6 +7034,15 @@ $/x
70347034
\= Expect no match
70357035
z
70367036

7037+
/^.{4}/s
7038+
abcdef
7039+
abcde
7040+
abcd
7041+
\= Expect no match
7042+
abc
7043+
ab
7044+
a
7045+
70377046
# --------------
70387047

70397048
# End of testinput1

0 commit comments

Comments
 (0)