Skip to content

Commit a2a3a6e

Browse files
authored
Improve repeat support for plus (#626)
1 parent 50811e2 commit a2a3a6e

File tree

2 files changed

Lines changed: 76 additions & 23 deletions

2 files changed

Lines changed: 76 additions & 23 deletions

src/pcre2_jit_compile.c

Lines changed: 75 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -318,9 +318,9 @@ typedef struct char_iterator_backtrack {
318318
struct {
319319
unsigned int othercasebit;
320320
PCRE2_UCHAR chr;
321-
BOOL enabled;
322321
} charpos;
323322
} u;
323+
BOOL charpos_enabled;
324324
} char_iterator_backtrack;
325325

326326
typedef struct ref_iterator_backtrack {
@@ -10624,7 +10624,7 @@ if (exact > 1)
1062410624
JUMPTO(SLJIT_NOT_ZERO, label);
1062510625
}
1062610626
}
10627-
else if (exact == 1 && opcode != OP_POSSTAR && opcode != OP_MINSTAR)
10627+
else if (exact == 1 && opcode != OP_STAR && opcode != OP_MINSTAR && opcode != OP_POSSTAR)
1062810628
compile_char1_matchingpath(common, type, cc, &backtrack->own_backtracks, TRUE);
1062910629

1063010630
if (early_fail_type == type_fail_range)
@@ -10642,18 +10642,30 @@ if (early_fail_type == type_fail_range)
1064210642

1064310643
switch(opcode)
1064410644
{
10645-
case OP_STAR:
1064610645
case OP_UPTO:
10646+
/* Exact is ignored for upto. */
10647+
exact = 0;
10648+
/* Fall through */
10649+
case OP_STAR:
1064710650
SLJIT_ASSERT(early_fail_ptr == 0 || opcode == OP_STAR);
1064810651

1064910652
if (type == OP_EXTUNI)
1065010653
{
1065110654
SLJIT_ASSERT(private_data_ptr == 0);
1065210655
SLJIT_ASSERT(early_fail_ptr == 0);
1065310656

10654-
allocate_stack(common, 2);
10655-
OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(0), STR_PTR, 0);
10656-
OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(1), SLJIT_IMM, 0);
10657+
if (exact == 1)
10658+
{
10659+
SLJIT_ASSERT(opcode == OP_STAR);
10660+
allocate_stack(common, 1);
10661+
OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(0), SLJIT_IMM, 0);
10662+
}
10663+
else
10664+
{
10665+
allocate_stack(common, 2);
10666+
OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(0), STR_PTR, 0);
10667+
OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(1), SLJIT_IMM, 0);
10668+
}
1065710669

1065810670
if (opcode == OP_UPTO)
1065910671
{
@@ -10688,6 +10700,9 @@ switch(opcode)
1068810700
{
1068910701
if (opcode == OP_STAR)
1069010702
{
10703+
if (exact == 1)
10704+
detect_partial_match(common, &backtrack->own_backtracks);
10705+
1069110706
if (private_data_ptr == 0)
1069210707
allocate_stack(common, 2);
1069310708

@@ -10768,7 +10783,7 @@ switch(opcode)
1076810783
if (charpos_othercasebit != 0)
1076910784
charpos_char |= charpos_othercasebit;
1077010785

10771-
BACKTRACK_AS(char_iterator_backtrack)->u.charpos.enabled = TRUE;
10786+
BACKTRACK_AS(char_iterator_backtrack)->charpos_enabled = TRUE;
1077210787
BACKTRACK_AS(char_iterator_backtrack)->u.charpos.chr = charpos_char;
1077310788
BACKTRACK_AS(char_iterator_backtrack)->u.charpos.othercasebit = charpos_othercasebit;
1077410789
}
@@ -10781,7 +10796,14 @@ switch(opcode)
1078110796
OP1(SLJIT_MOV, tmp_base, tmp_offset, SLJIT_IMM, max + 1);
1078210797

1078310798
/* Search the first instance of charpos_char. */
10784-
jump = JUMP(SLJIT_JUMP);
10799+
if (exact == 1)
10800+
{
10801+
SLJIT_ASSERT(opcode == OP_STAR);
10802+
detect_partial_match(common, &backtrack->own_backtracks);
10803+
}
10804+
else
10805+
jump = JUMP(SLJIT_JUMP);
10806+
1078510807
label = LABEL();
1078610808
if (opcode == OP_UPTO)
1078710809
{
@@ -10791,7 +10813,9 @@ switch(opcode)
1079110813
compile_char1_matchingpath(common, type, cc, &backtrack->own_backtracks, FALSE);
1079210814
if (early_fail_ptr != 0)
1079310815
OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), early_fail_ptr, STR_PTR, 0);
10794-
JUMPHERE(jump);
10816+
10817+
if (exact != 1)
10818+
JUMPHERE(jump);
1079510819

1079610820
detect_partial_match(common, &backtrack->own_backtracks);
1079710821
OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
@@ -10899,6 +10923,12 @@ switch(opcode)
1089910923
OP1(SLJIT_MOV, base, offset0, STR_PTR, 0);
1090010924
}
1090110925

10926+
if (exact == 1)
10927+
{
10928+
SLJIT_ASSERT(opcode == OP_STAR);
10929+
add_jump(compiler, &BACKTRACK_AS(char_iterator_backtrack)->u.backtracks, CMP(SLJIT_EQUAL, base, offset1, STR_PTR, 0));
10930+
}
10931+
1090210932
if (early_fail_ptr != 0)
1090310933
OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), early_fail_ptr, STR_PTR, 0);
1090410934
}
@@ -11580,16 +11610,18 @@ struct sljit_jump *jump;
1158011610

1158111611
OP2(SLJIT_SUB, TMP1, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
1158211612
jump = CMP(SLJIT_LESS_EQUAL, TMP1, 0, TMP2, 0);
11613+
/* All newlines are single byte, or their last byte
11614+
is not equal to CHAR_NL/CHAR_CR even if UTF is enabled. */
11615+
OP1(MOV_UCHAR, SLJIT_TMP_DEST_REG, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-2));
1158311616
OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-1));
11584-
OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, CHAR_NL);
11585-
OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-2));
11586-
OP_FLAGS(SLJIT_MOV, SLJIT_TMP_DEST_REG, 0, SLJIT_EQUAL);
11587-
OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, CHAR_CR);
11588-
OP_FLAGS(SLJIT_AND, SLJIT_TMP_DEST_REG, 0, SLJIT_EQUAL);
11617+
OP2(SLJIT_SHL, SLJIT_TMP_DEST_REG, 0, SLJIT_TMP_DEST_REG, 0, SLJIT_IMM, 8);
11618+
OP2(SLJIT_OR, TMP1, 0, TMP1, 0, SLJIT_TMP_DEST_REG, 0);
11619+
OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, CHAR_CR << 8 | CHAR_NL);
11620+
OP_FLAGS(SLJIT_MOV, TMP1, 0, SLJIT_EQUAL);
1158911621
#if PCRE2_CODE_UNIT_WIDTH == 16 || PCRE2_CODE_UNIT_WIDTH == 32
11590-
OP2(SLJIT_SHL, SLJIT_TMP_DEST_REG, 0, SLJIT_TMP_DEST_REG, 0, SLJIT_IMM, UCHAR_SHIFT);
11622+
OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, UCHAR_SHIFT);
1159111623
#endif
11592-
OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_TMP_DEST_REG, 0);
11624+
OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
1159311625
JUMPHERE(jump);
1159411626
}
1159511627

@@ -11625,7 +11657,7 @@ switch(opcode)
1162511657
}
1162611658
else
1162711659
{
11628-
if (CURRENT_AS(char_iterator_backtrack)->u.charpos.enabled)
11660+
if (CURRENT_AS(char_iterator_backtrack)->charpos_enabled)
1162911661
{
1163011662
OP1(SLJIT_MOV, STR_PTR, 0, base, offset0);
1163111663
OP1(SLJIT_MOV, TMP2, 0, base, offset1);
@@ -11646,19 +11678,39 @@ switch(opcode)
1164611678
else
1164711679
{
1164811680
OP1(SLJIT_MOV, STR_PTR, 0, base, offset0);
11649-
if (type == OP_ANYNL)
11681+
11682+
if (opcode == OP_STAR && exact == 1)
1165011683
{
11651-
OP1(SLJIT_MOV, TMP2, 0, base, offset1);
11652-
jump = CMP(SLJIT_LESS_EQUAL, STR_PTR, 0, TMP2, 0);
11653-
compile_newline_move_back(common);
11684+
if (type == OP_ANYNL)
11685+
{
11686+
OP1(SLJIT_MOV, TMP2, 0, base, offset1);
11687+
compile_newline_move_back(common);
11688+
}
11689+
11690+
move_back(common, NULL, TRUE);
11691+
jump = CMP(SLJIT_LESS_EQUAL, STR_PTR, 0, base, offset1);
1165411692
}
1165511693
else
11656-
jump = CMP(SLJIT_LESS_EQUAL, STR_PTR, 0, base, offset1);
11694+
{
11695+
if (type == OP_ANYNL)
11696+
{
11697+
OP1(SLJIT_MOV, TMP2, 0, base, offset1);
11698+
jump = CMP(SLJIT_LESS_EQUAL, STR_PTR, 0, TMP2, 0);
11699+
compile_newline_move_back(common);
11700+
}
11701+
else
11702+
jump = CMP(SLJIT_LESS_EQUAL, STR_PTR, 0, base, offset1);
11703+
11704+
move_back(common, NULL, TRUE);
11705+
}
1165711706

11658-
move_back(common, NULL, TRUE);
1165911707
OP1(SLJIT_MOV, base, offset0, STR_PTR, 0);
1166011708
JUMPTO(SLJIT_JUMP, CURRENT_AS(char_iterator_backtrack)->matchingpath);
11709+
11710+
if (opcode == OP_STAR && exact == 1)
11711+
set_jumps(CURRENT_AS(char_iterator_backtrack)->u.backtracks, LABEL());
1166111712
}
11713+
1166211714
JUMPHERE(jump);
1166311715
if (private_data_ptr == 0)
1166411716
free_stack(common, 2);

src/pcre2_jit_test.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -474,6 +474,7 @@ static struct regression_test_case regression_test_cases[] = {
474474
{ MU, A, 0, 0, "\\R+", "ab\r\n\r" },
475475
{ MU, A, 0, 0, "\\R*", "ab\r\n\r" },
476476
{ MU, A, 0, 0, "\\R*", "\r\n\r" },
477+
{ M, A, 0, 0, "\\R+\x85", "\r\n\n\r#\r\x85\n" },
477478
{ MU, A, 0, 0, "\\R{2,4}", "\r\nab\r\r" },
478479
{ MU, A, 0, 0, "\\R{2,4}", "\r\nab\n\n\n\r\r\r" },
479480
{ MU, A, 0, 0, "\\R{2,}", "\r\nab\n\n\n\r\r\r" },

0 commit comments

Comments
 (0)