Skip to content

Commit e84e6e7

Browse files
authored
Improve greedy repeat support (#639)
1 parent 4d51186 commit e84e6e7

File tree

5 files changed

+212 additions, -58 deletions

src/pcre2_jit_compile.c

Lines changed: 130 additions & 58 deletions
Original file line number | Diff line number | Diff line change
@@ -1769,7 +1769,7 @@ switch(*cc)
17691769
if (max == 0)
17701770
return (*cc == OP_CRRANGE) ? 2 : 1;
17711771
max -= min;
1772-
if (max > 2)
1772+
if (max > (*cc == OP_CRRANGE ? 0 : 1))
17731773
max = 2;
17741774
return max;
17751775

@@ -10467,10 +10467,12 @@ else
1046710467
if (*max == 0)
1046810468
{
1046910469
SLJIT_ASSERT(*exact > 1);
10470-
if (*opcode == OP_CRPOSRANGE)
10470+
if (*opcode == OP_CRRANGE)
10471+
*opcode = OP_UPTO;
10472+
else if (*opcode == OP_CRPOSRANGE)
1047110473
*opcode = OP_POSUPTO;
1047210474
else
10473-
*opcode -= OP_CRRANGE - OP_STAR;
10475+
*opcode = OP_MINSTAR;
1047410476
return cc;
1047510477
}
1047610478

@@ -10480,12 +10482,14 @@ else
1048010482
else
1048110483
{
1048210484
SLJIT_ASSERT(*exact > 0 || *max > 1);
10483-
if (*opcode == OP_CRPOSRANGE)
10485+
if (*opcode == OP_CRRANGE)
10486+
*opcode = OP_UPTO;
10487+
else if (*opcode == OP_CRPOSRANGE)
1048410488
*opcode = OP_POSUPTO;
1048510489
else if (*max == 1)
10486-
*opcode -= OP_CRRANGE - OP_QUERY;
10490+
*opcode = OP_MINQUERY;
1048710491
else
10488-
*opcode -= OP_CRRANGE - OP_UPTO;
10492+
*opcode = OP_MINUPTO;
1048910493
}
1049010494
return cc;
1049110495
}
@@ -10586,7 +10590,7 @@ else
1058610590
}
1058710591

1058810592
/* Handle fixed part first. */
10589-
if (opcode != OP_POSUPTO)
10593+
if (opcode != OP_UPTO && opcode != OP_POSUPTO)
1059010594
{
1059110595
if (exact > 1)
1059210596
{
@@ -10646,11 +10650,9 @@ if (early_fail_type == type_fail_range)
1064610650
switch(opcode)
1064710651
{
1064810652
case OP_UPTO:
10649-
/* Exact is ignored for upto. */
10650-
exact = 0;
10651-
/* Fall through */
1065210653
case OP_STAR:
1065310654
SLJIT_ASSERT(early_fail_ptr == 0 || opcode == OP_STAR);
10655+
max += exact;
1065410656

1065510657
if (type == OP_EXTUNI)
1065610658
{
@@ -10665,6 +10667,9 @@ switch(opcode)
1066510667
}
1066610668
else
1066710669
{
10670+
/* If OP_EXTUNI is present, it has a separate EXACT opcode. */
10671+
SLJIT_ASSERT(exact == 0);
10672+
1066810673
allocate_stack(common, 2);
1066910674
OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(0), STR_PTR, 0);
1067010675
OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(1), SLJIT_IMM, 0);
@@ -10726,6 +10731,9 @@ switch(opcode)
1072610731
else
1072710732
#endif
1072810733
{
10734+
/* If OP_ALLANY is present, it has a separate EXACT opcode. */
10735+
SLJIT_ASSERT(exact == 0);
10736+
1072910737
if (private_data_ptr == 0)
1073010738
allocate_stack(common, 2);
1073110739

@@ -10794,71 +10802,86 @@ switch(opcode)
1079410802
if (private_data_ptr == 0)
1079510803
allocate_stack(common, 2);
1079610804

10797-
use_tmp = (!HAS_VIRTUAL_REGISTERS && opcode == OP_STAR);
10805+
use_tmp = (opcode == OP_STAR);
1079810806

1079910807
if (use_tmp)
1080010808
{
10801-
OP1(SLJIT_MOV, tmp_base, tmp_offset, SLJIT_IMM, 0);
10802-
OP1(SLJIT_MOV, base, offset0, tmp_base, tmp_offset);
10809+
OP1(SLJIT_MOV, TMP3, 0, SLJIT_IMM, 0);
10810+
OP1(SLJIT_MOV, base, offset0, TMP3, 0);
1080310811
}
1080410812
else
1080510813
{
10806-
OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0);
10807-
OP1(SLJIT_MOV, base, offset0, TMP1, 0);
10808-
OP1(SLJIT_MOV, base, offset1, TMP1, 0);
10809-
if (opcode == OP_UPTO)
10810-
OP1(SLJIT_MOV, tmp_base, tmp_offset, SLJIT_IMM, max + 1);
10814+
OP1(SLJIT_MOV, base, offset1, COUNT_MATCH, 0);
10815+
OP1(SLJIT_MOV, COUNT_MATCH, 0, SLJIT_IMM, 0);
10816+
OP1(SLJIT_MOV, base, offset0, COUNT_MATCH, 0);
10817+
OP1(SLJIT_MOV, TMP3, 0, SLJIT_IMM, exact == max ? 0 : (max + 1));
1081110818
}
1081210819

1081310820
/* Search the first instance of charpos_char. */
10814-
if (exact == 1)
10815-
{
10816-
SLJIT_ASSERT(opcode == OP_STAR);
10821+
if (exact > 0)
1081710822
detect_partial_match(common, &no_match);
10818-
}
1081910823
else
1082010824
jump = JUMP(SLJIT_JUMP);
1082110825

1082210826
label = LABEL();
10827+
1082310828
if (opcode == OP_UPTO)
1082410829
{
10825-
OP2(SLJIT_SUB | SLJIT_SET_Z, tmp_base, tmp_offset, tmp_base, tmp_offset, SLJIT_IMM, 1);
10826-
add_jump(compiler, &no_match, JUMP(SLJIT_ZERO));
10830+
if (exact == max)
10831+
OP2(SLJIT_ADD, TMP3, 0, TMP3, 0, SLJIT_IMM, 1);
10832+
else
10833+
{
10834+
OP2(SLJIT_SUB | SLJIT_SET_Z, TMP3, 0, TMP3, 0, SLJIT_IMM, 1);
10835+
add_jump(compiler, &no_match, JUMP(SLJIT_ZERO));
10836+
}
1082710837
}
10838+
1082810839
compile_char1_matchingpath(common, type, cc, &no_match, FALSE);
1082910840

1083010841
if (early_fail_ptr != 0)
1083110842
OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), early_fail_ptr, STR_PTR, 0);
1083210843

10833-
if (exact != 1)
10844+
if (exact == 0)
1083410845
JUMPHERE(jump);
1083510846

1083610847
detect_partial_match(common, &no_match);
10848+
10849+
if (opcode == OP_UPTO && exact > 0)
10850+
{
10851+
if (exact == max)
10852+
CMPTO(SLJIT_LESS, TMP3, 0, SLJIT_IMM, exact, label);
10853+
else
10854+
CMPTO(SLJIT_GREATER, TMP3, 0, SLJIT_IMM, (max + 1) - exact, label);
10855+
}
10856+
1083710857
OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
1083810858
if (charpos_othercasebit != 0)
1083910859
OP2(SLJIT_OR, TMP1, 0, TMP1, 0, SLJIT_IMM, charpos_othercasebit);
1084010860
CMPTO(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, charpos_char, label);
1084110861

10862+
OP1(SLJIT_MOV, base, offset0, STR_PTR, 0);
1084210863
if (use_tmp)
1084310864
{
10844-
OP1(SLJIT_MOV, base, offset0, STR_PTR, 0);
10845-
OP2U(SLJIT_SUB | SLJIT_SET_Z, tmp_base, tmp_offset, SLJIT_IMM, 0);
10846-
SELECT(SLJIT_EQUAL, tmp_base, STR_PTR, 0, tmp_base);
10865+
OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP3, 0, SLJIT_IMM, 0);
10866+
SELECT(SLJIT_EQUAL, TMP3, STR_PTR, 0, TMP3);
1084710867
}
1084810868
else
1084910869
{
10850-
OP1(SLJIT_MOV, TMP2, 0, base, offset1);
10851-
OP1(SLJIT_MOV, base, offset0, STR_PTR, 0);
10852-
OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP2, 0, SLJIT_IMM, 0);
10853-
SELECT(SLJIT_EQUAL, TMP2, STR_PTR, 0, TMP2);
10854-
OP1(SLJIT_MOV, base, offset1, TMP2, 0);
10870+
OP2U(SLJIT_SUB | SLJIT_SET_Z, COUNT_MATCH, 0, SLJIT_IMM, 0);
10871+
SELECT(SLJIT_EQUAL, COUNT_MATCH, STR_PTR, 0, COUNT_MATCH);
1085510872
}
1085610873
JUMPTO(SLJIT_JUMP, label);
1085710874

1085810875
set_jumps(no_match, LABEL());
1085910876
OP1(SLJIT_MOV, STR_PTR, 0, base, offset0);
1086010877
if (use_tmp)
10861-
OP1(SLJIT_MOV, base, offset1, tmp_base, tmp_offset);
10878+
OP1(SLJIT_MOV, base, offset1, TMP3, 0);
10879+
else
10880+
{
10881+
OP1(SLJIT_MOV, TMP1, 0, base, offset1);
10882+
OP1(SLJIT_MOV, base, offset1, COUNT_MATCH, 0);
10883+
OP1(SLJIT_MOV, COUNT_MATCH, 0, TMP1, 0);
10884+
}
1086210885

1086310886
add_jump(compiler, &backtrack->own_backtracks, CMP(SLJIT_EQUAL, STR_PTR, 0, SLJIT_IMM, 0));
1086410887

@@ -10872,60 +10895,110 @@ switch(opcode)
1087210895
if (private_data_ptr == 0)
1087310896
allocate_stack(common, 2);
1087410897

10875-
OP1(SLJIT_MOV, base, offset1, STR_PTR, 0);
1087610898
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
10877-
use_tmp = (!HAS_VIRTUAL_REGISTERS && opcode == OP_STAR);
10878-
SLJIT_ASSERT(!use_tmp || tmp_base == TMP3);
10899+
use_tmp = (opcode == OP_STAR);
1087910900

1088010901
if (common->utf)
10881-
OP1(SLJIT_MOV, use_tmp ? TMP3 : base, use_tmp ? 0 : offset0, STR_PTR, 0);
10902+
{
10903+
if (!use_tmp)
10904+
OP1(SLJIT_MOV, base, offset0, COUNT_MATCH, 0);
10905+
10906+
OP1(SLJIT_MOV, use_tmp ? TMP3 : COUNT_MATCH, 0, STR_PTR, 0);
10907+
}
1088210908
#endif
10909+
1088310910
if (opcode == OP_UPTO)
10884-
OP1(SLJIT_MOV, tmp_base, tmp_offset, SLJIT_IMM, max);
10911+
OP1(SLJIT_MOV, TMP3, 0, SLJIT_IMM, exact == max ? -(sljit_sw)exact : (sljit_sw)max);
1088510912

10886-
detect_partial_match(common, &no_match);
10887-
label = LABEL();
10888-
compile_char1_matchingpath(common, type, cc, &no_char1_match, FALSE);
10913+
if (opcode == OP_UPTO && exact > 0)
10914+
{
10915+
label = LABEL();
10916+
detect_partial_match(common, &no_match);
10917+
compile_char1_matchingpath(common, type, cc, &no_char1_match, FALSE);
1088910918
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
10890-
if (common->utf)
10891-
OP1(SLJIT_MOV, use_tmp ? TMP3 : base, use_tmp ? 0 : offset0, STR_PTR, 0);
10919+
if (common->utf)
10920+
OP1(SLJIT_MOV, use_tmp ? TMP3 : COUNT_MATCH, 0, STR_PTR, 0);
1089210921
#endif
1089310922

10894-
if (opcode == OP_UPTO)
10895-
{
10896-
OP2(SLJIT_SUB | SLJIT_SET_Z, tmp_base, tmp_offset, tmp_base, tmp_offset, SLJIT_IMM, 1);
10897-
add_jump(compiler, &no_match, JUMP(SLJIT_ZERO));
10923+
if (exact == max)
10924+
{
10925+
OP2(SLJIT_ADD | SLJIT_SET_Z, TMP3, 0, TMP3, 0, SLJIT_IMM, 1);
10926+
JUMPTO(SLJIT_NOT_ZERO, label);
10927+
}
10928+
else
10929+
{
10930+
OP2(SLJIT_SUB | SLJIT_SET_Z, TMP3, 0, TMP3, 0, SLJIT_IMM, 1);
10931+
add_jump(compiler, &no_match, JUMP(SLJIT_ZERO));
10932+
CMPTO(SLJIT_NOT_EQUAL, TMP3, 0, SLJIT_IMM, max - exact, label);
10933+
}
10934+
10935+
OP1(SLJIT_MOV, base, offset1, STR_PTR, 0);
10936+
JUMPTO(SLJIT_JUMP, label);
1089810937
}
10938+
else
10939+
{
10940+
OP1(SLJIT_MOV, base, offset1, STR_PTR, 0);
1089910941

10900-
detect_partial_match_to(common, label);
10901-
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
10942+
detect_partial_match(common, &no_match);
10943+
label = LABEL();
10944+
compile_char1_matchingpath(common, type, cc, &no_char1_match, FALSE);
10945+
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
10946+
if (common->utf)
10947+
OP1(SLJIT_MOV, use_tmp ? TMP3 : COUNT_MATCH, 0, STR_PTR, 0);
10948+
#endif
10949+
10950+
if (opcode == OP_UPTO)
10951+
{
10952+
OP2(SLJIT_SUB | SLJIT_SET_Z, TMP3, 0, TMP3, 0, SLJIT_IMM, 1);
10953+
add_jump(compiler, &no_match, JUMP(SLJIT_ZERO));
10954+
}
10955+
10956+
detect_partial_match_to(common, label);
10957+
}
1090210958

10903-
set_jumps(no_char1_match, LABEL());
1090410959
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1090510960
if (common->utf)
1090610961
{
10962+
set_jumps(no_char1_match, LABEL());
1090710963
set_jumps(no_match, LABEL());
1090810964
if (use_tmp)
1090910965
{
1091010966
OP1(SLJIT_MOV, STR_PTR, 0, TMP3, 0);
1091110967
OP1(SLJIT_MOV, base, offset0, TMP3, 0);
1091210968
}
1091310969
else
10914-
OP1(SLJIT_MOV, STR_PTR, 0, base, offset0);
10970+
{
10971+
OP1(SLJIT_MOV, STR_PTR, 0, COUNT_MATCH, 0);
10972+
OP1(SLJIT_MOV, COUNT_MATCH, 0, base, offset0);
10973+
OP1(SLJIT_MOV, base, offset0, STR_PTR, 0);
10974+
}
1091510975
}
1091610976
else
1091710977
#endif
1091810978
{
10979+
if (opcode != OP_UPTO || exact == 0)
10980+
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
10981+
set_jumps(no_char1_match, LABEL());
10982+
1091910983
OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
1092010984
set_jumps(no_match, LABEL());
1092110985
OP1(SLJIT_MOV, base, offset0, STR_PTR, 0);
1092210986
}
1092310987

10924-
if (exact == 1)
10988+
if (opcode == OP_UPTO)
1092510989
{
10926-
SLJIT_ASSERT(opcode == OP_STAR);
10927-
add_jump(compiler, &BACKTRACK_AS(char_iterator_backtrack)->u.backtracks, CMP(SLJIT_EQUAL, base, offset1, STR_PTR, 0));
10990+
if (exact > 0)
10991+
{
10992+
if (max == exact)
10993+
jump = CMP(SLJIT_GREATER_EQUAL, TMP3, 0, SLJIT_IMM, -(sljit_sw)exact);
10994+
else
10995+
jump = CMP(SLJIT_GREATER, TMP3, 0, SLJIT_IMM, max - exact);
10996+
10997+
add_jump(compiler, &BACKTRACK_AS(char_iterator_backtrack)->u.backtracks, jump);
10998+
}
1092810999
}
11000+
else if (exact == 1)
11001+
add_jump(compiler, &BACKTRACK_AS(char_iterator_backtrack)->u.backtracks, CMP(SLJIT_EQUAL, base, offset1, STR_PTR, 0));
1092911002

1093011003
if (early_fail_ptr != 0)
1093111004
OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), early_fail_ptr, STR_PTR, 0);
@@ -11052,8 +11125,8 @@ switch(opcode)
1105211125

1105311126
case OP_POSUPTO:
1105411127
SLJIT_ASSERT(early_fail_ptr == 0);
11055-
1105611128
max += exact;
11129+
1105711130
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1105811131
if (type == OP_EXTUNI || common->utf)
1105911132
#else
@@ -11738,8 +11811,7 @@ switch(opcode)
1173811811
OP1(SLJIT_MOV, base, offset0, STR_PTR, 0);
1173911812
JUMPTO(SLJIT_JUMP, CURRENT_AS(char_iterator_backtrack)->matchingpath);
1174011813

11741-
if (opcode == OP_STAR && exact == 1)
11742-
set_jumps(CURRENT_AS(char_iterator_backtrack)->u.backtracks, LABEL());
11814+
set_jumps(CURRENT_AS(char_iterator_backtrack)->u.backtracks, LABEL());
1174311815
}
1174411816

1174511817
JUMPHERE(jump);

testdata/testinput1

Lines changed: 17 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -7043,6 +7043,23 @@ $/x
70437043
ab
70447044
a
70457045

7046+
/^(.{3,6}!)+$/s
7047+
abc!defghi!
7048+
abcdef!ghi!
7049+
abc!def!ghi!jkl!
7050+
ab!cd!
7051+
\= Expect no match
7052+
abcd!ef!
7053+
ab!cdefg!
7054+
7055+
/[a-z]{5,}b|x/
7056+
abcdefghbijb
7057+
abcdefghbij
7058+
abcdeb
7059+
\= Expect no match
7060+
abcdb
7061+
abcdefghijk
7062+
70467063
# --------------
70477064

70487065
# End of testinput1

testdata/testinput4

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1856,6 +1856,19 @@
18561856
/[z\x{017f}]+/i,utf
18571857
\x{0053}\x{0073}\x{017f}
18581858

1859+
/^[a-z\x{500}-\x{1000}]{3,}[a-h]|x/utf
1860+
ab\x{600}ijklmh
1861+
ab\x{600}hijklm
1862+
\= Expect no match
1863+
ab\x{600}ijklm
1864+
1865+
/^[a-z\x{500}-\x{1000}]{4,7}[a-h]|x/utf
1866+
ab\x{600}\x{700}ijkh
1867+
ab\x{600}\x{700}hijkl
1868+
\= Expect no match
1869+
ab\x{600}\x{700}ijklh
1870+
ab\x{600}h\x{700}ijklmh
1871+
18591872
# --------------------------------------
18601873

18611874
/(ΣΆΜΟΣ) \1/i,utf

0 commit comments

Comments
 (0)