Skip to content

Commit 9c297fa

Browse files
mgartnerADustyOldMuffinsriram2000nadils2k
committed
opt: collapse repeated "%" wildcards in LIKE patterns
A pattern in a `LIKE` comparison with sequential "%" wildcards is now normalized into a string with at most one sequential wildcard. For example, `c LIKE '%%abc%%%'` is now normalized into `c LIKE '%abc%'`. Fixes #80192. Release note (performance improvement): The optimizer now collapses repeated "%" wildcard characters in LIKE patterns. This may improve performance of queries with these types of LIKE patterns. Co-authored-by: Daniel Hix <danieljacobhix@gmail.com> Co-authored-by: Sriram <sriraam2000na@gmail.com> Co-authored-by: Marcus Gartner <magartner@gmail.com> Co-authored-by: dils2k <dils.matchanov@gmail.com>
1 parent cb68f89 commit 9c297fa

File tree

5 files changed

+299
-0
lines changed

5 files changed

+299
-0
lines changed

pkg/sql/opt/norm/comp_funcs.go

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import (
99
"github.com/cockroachdb/cockroach/pkg/sql/opt"
1010
"github.com/cockroachdb/cockroach/pkg/sql/opt/memo"
1111
"github.com/cockroachdb/cockroach/pkg/sql/sem/builtins/builtinsregistry"
12+
"github.com/cockroachdb/cockroach/pkg/sql/sem/eval"
1213
"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
1314
"github.com/cockroachdb/cockroach/pkg/sql/types"
1415
"github.com/cockroachdb/errors"
@@ -141,3 +142,34 @@ func (c *CustomFuncs) MakeIntersectionFunction(args memo.ScalarListExpr) opt.Sca
141142
},
142143
)
143144
}
145+
146+
// CollapseRepeatedLikePatternWildcards returns a new pattern string datum with
147+
// repeated "%" wildcards collapsed. It returns ok=false if the pattern is not a
148+
// constant string or there are no repeated characters to collapse.
149+
func (c *CustomFuncs) CollapseRepeatedLikePatternWildcards(
150+
pattern opt.ScalarExpr,
151+
) (_ opt.ScalarExpr, ok bool) {
152+
patternConst, ok := pattern.(*memo.ConstExpr)
153+
if !ok {
154+
return nil, false
155+
}
156+
switch t := patternConst.Value.(type) {
157+
case *tree.DString:
158+
orig := string(*t)
159+
collapsed := eval.CollapseLikeWildcards(orig)
160+
if orig != collapsed {
161+
return c.f.ConstructConstVal(tree.NewDString(collapsed), types.String), true
162+
}
163+
case *tree.DCollatedString:
164+
orig := t.Contents
165+
collapsed := eval.CollapseLikeWildcards(orig)
166+
if orig != collapsed {
167+
d, err := tree.NewDCollatedString(collapsed, t.Locale, &c.f.evalCtx.CollationEnv)
168+
if err != nil {
169+
panic(err)
170+
}
171+
return c.f.ConstructConstVal(d, patternConst.Typ), true
172+
}
173+
}
174+
return nil, false
175+
}

pkg/sql/opt/norm/rules/comp.opt

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -279,3 +279,20 @@ $left
279279
(Ne $left:* (False))
280280
=>
281281
$left
282+
283+
# CollapseLikePatternWildcards collapses repeated '%' wildcards into a single
284+
# '%' in the pattern of a LIKE expression.
285+
[CollapseRepeatedLikePatternWildcards, Normalize]
286+
(Like | NotLike | ILike | NotILike
287+
$input:*
288+
$pattern:* &
289+
(Let
290+
(
291+
$collapsed
292+
$ok
293+
):(CollapseRepeatedLikePatternWildcards $pattern)
294+
$ok
295+
)
296+
)
297+
=>
298+
((OpName) $input $collapsed)

pkg/sql/opt/norm/testdata/rules/comp

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1085,3 +1085,114 @@ project
10851085
│ └── columns: b:2
10861086
└── projections
10871087
└── b:2 [as="?column?":5, outer=(2)]
1088+
1089+
# --------------------------------------------------
1090+
# CollapseRepeatedLikePatternWildcards
1091+
# --------------------------------------------------
1092+
1093+
norm expect=CollapseRepeatedLikePatternWildcards
1094+
SELECT
1095+
s LIKE '%%%ab%%%',
1096+
s LIKE '%%%abc%%%def%%%',
1097+
s LIKE '%%%abc%%%def%%%ghi',
1098+
s LIKE '\\%%%abc\\%%%def\\%%%ghi',
1099+
s LIKE '%%%🐛🐛%%🏠%%%',
1100+
s NOT LIKE '%%%ab%%%',
1101+
s ILIKE '%%%ab%%%',
1102+
s NOT ILIKE '%%%ab%%%'
1103+
FROM a
1104+
----
1105+
project
1106+
├── columns: "?column?":12 "?column?":13 "?column?":14 "?column?":15 "?column?":16 "?column?":17 "?column?":18 "?column?":19
1107+
├── scan a
1108+
│ └── columns: s:4
1109+
└── projections
1110+
├── s:4 LIKE '%ab%' [as="?column?":12, outer=(4)]
1111+
├── s:4 LIKE '%abc%def%' [as="?column?":13, outer=(4)]
1112+
├── s:4 LIKE '%abc%def%ghi' [as="?column?":14, outer=(4)]
1113+
├── s:4 LIKE e'\\\\%abc\\\\%def\\\\%ghi' [as="?column?":15, outer=(4)]
1114+
├── s:4 LIKE e'%\U0001F41B\U0001F41B%\U0001F3E0%' [as="?column?":16, outer=(4)]
1115+
├── s:4 NOT LIKE '%ab%' [as="?column?":17, outer=(4)]
1116+
├── s:4 ILIKE '%ab%' [as="?column?":18, outer=(4)]
1117+
└── s:4 NOT ILIKE '%ab%' [as="?column?":19, outer=(4)]
1118+
1119+
# Collated strings can be collapsed.
1120+
norm expect=CollapseRepeatedLikePatternWildcards
1121+
SELECT
1122+
s COLLATE "en_US" LIKE '%%%ab%%%' COLLATE "en_US",
1123+
s COLLATE "en_US" LIKE '%%%abc%%%def%%%' COLLATE "en_US",
1124+
s COLLATE "en_US" LIKE '%%%abc%%%def%%%ghi' COLLATE "en_US",
1125+
s COLLATE "en_US" LIKE '\\%%%abc\\%%%def\\%%%ghi' COLLATE "en_US",
1126+
s COLLATE "en_US" LIKE '%%%🐛🐛%%🏠%%%' COLLATE "en_US",
1127+
s COLLATE "en_US" NOT LIKE '%%%ab%%%' COLLATE "en_US"
1128+
FROM a
1129+
----
1130+
project
1131+
├── columns: "?column?":12 "?column?":13 "?column?":14 "?column?":15 "?column?":16 "?column?":17
1132+
├── immutable
1133+
├── scan a
1134+
│ └── columns: s:4
1135+
└── projections
1136+
├── s:4 COLLATE en_US LIKE '%ab%' COLLATE en_US [as="?column?":12, outer=(4), immutable]
1137+
├── s:4 COLLATE en_US LIKE '%abc%def%' COLLATE en_US [as="?column?":13, outer=(4), immutable]
1138+
├── s:4 COLLATE en_US LIKE '%abc%def%ghi' COLLATE en_US [as="?column?":14, outer=(4), immutable]
1139+
├── s:4 COLLATE en_US LIKE e'\\\\%abc\\\\%def\\\\%ghi' COLLATE en_US [as="?column?":15, outer=(4), immutable]
1140+
├── s:4 COLLATE en_US LIKE e'%\U0001F41B\U0001F41B%\U0001F3E0%' COLLATE en_US [as="?column?":16, outer=(4), immutable]
1141+
└── s:4 COLLATE en_US NOT LIKE '%ab%' COLLATE en_US [as="?column?":17, outer=(4), immutable]
1142+
1143+
# Escaped wildcards should not be collapsed.
1144+
norm expect-not=CollapseRepeatedLikePatternWildcards
1145+
SELECT
1146+
s LIKE '\%%ab\%%',
1147+
s LIKE '\%%abc\%%def\%%',
1148+
s LIKE '%\%abc%\%def%\%',
1149+
s LIKE '-%%' ESCAPE '-'
1150+
FROM a
1151+
----
1152+
project
1153+
├── columns: "?column?":12 "?column?":13 "?column?":14 like_escape:15
1154+
├── immutable
1155+
├── scan a
1156+
│ └── columns: s:4
1157+
└── projections
1158+
├── s:4 LIKE e'\\%%ab\\%%' [as="?column?":12, outer=(4)]
1159+
├── s:4 LIKE e'\\%%abc\\%%def\\%%' [as="?column?":13, outer=(4)]
1160+
├── s:4 LIKE e'%\\%abc%\\%def%\\%' [as="?column?":14, outer=(4)]
1161+
└── like_escape(s:4, '-%%', '-') [as=like_escape:15, outer=(4), immutable]
1162+
1163+
# The rule should not fire when there are no repeated wildcards.
1164+
norm expect-not=CollapseRepeatedLikePatternWildcards
1165+
SELECT
1166+
s LIKE 'ab',
1167+
s LIKE '%ab%',
1168+
s LIKE '%a%b%'
1169+
FROM a
1170+
----
1171+
project
1172+
├── columns: "?column?":12 "?column?":13 "?column?":14
1173+
├── scan a
1174+
│ └── columns: s:4
1175+
└── projections
1176+
├── s:4 LIKE 'ab' [as="?column?":12, outer=(4)]
1177+
├── s:4 LIKE '%ab%' [as="?column?":13, outer=(4)]
1178+
└── s:4 LIKE '%a%b%' [as="?column?":14, outer=(4)]
1179+
1180+
norm expect-not=CollapseRepeatedLikePatternWildcards
1181+
SELECT s LIKE NULL FROM a
1182+
----
1183+
project
1184+
├── columns: "?column?":12
1185+
├── fd: ()-->(12)
1186+
├── scan a
1187+
└── projections
1188+
└── NULL [as="?column?":12]
1189+
1190+
norm expect-not=CollapseRepeatedLikePatternWildcards
1191+
SELECT s LIKE s FROM a
1192+
----
1193+
project
1194+
├── columns: "?column?":12
1195+
├── scan a
1196+
│ └── columns: s:4
1197+
└── projections
1198+
└── s:4 LIKE s:4 [as="?column?":12, outer=(4)]

pkg/sql/sem/eval/match.go

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,89 @@ func ConvertLikeToRegexp(
7878
return re, nil
7979
}
8080

81+
// CollapseLikeWildcards collapses repeated "%" wildcards in the given pattern
// string, returning the collapsed string. Wildcards escaped with a backslash
// "\" are not collapsed: an escaped "%" is a literal and is always kept, even
// when it is directly followed by an unescaped "%".
//
// If the pattern contains no two adjacent "%" bytes it is returned unchanged
// without copying.
func CollapseLikeWildcards(pattern string) string {
	// NOTE: It is safe to iterate byte-wise forward and backward because no
	// multibyte UTF-8 characters contain ASCII bytes.

	// Fast path: collapsing only ever removes a "%" that is directly adjacent
	// to another "%", so a pattern without "%%" is already collapsed. Without
	// this check, patterns such as "%a%b%" would be copied section by section
	// below only to produce an identical string.
	if !strings.Contains(pattern, "%%") {
		return pattern
	}

	// Trim a leading run of "%" down to a single "%". The first byte of the
	// pattern cannot be preceded by a backslash, so the run is never escaped.
	start := 0
	for start < len(pattern) && pattern[start] == '%' {
		start++
	}
	if start > 1 {
		pattern = pattern[start-1:]
	}

	// Trim a trailing run of "%" down to a single wildcard. After this loop,
	// end is the index of the first "%" in the trailing run (or len(pattern)
	// if there is no trailing "%").
	end := len(pattern)
	for end > 0 && pattern[end-1] == '%' {
		end--
	}
	// An odd number of backslashes directly before the trailing run means that
	// the first "%" of the run is escaped. That "%" is a literal, so it is
	// kept in addition to one wildcard.
	escapeCount := 0
	for i := end - 1; i >= 0 && pattern[i] == '\\'; i-- {
		escapeCount++
	}
	if escapeCount%2 == 1 {
		end++
	}
	if end < len(pattern) {
		pattern = pattern[:end+1]
	}

	// Next, we successively find sections without repeated "%" and copy them
	// into sb. start and end form the bounds of the current section, with end
	// being exclusive.
	var sb strings.Builder
	start, end = 0, 0
	if len(pattern) > 0 && pattern[0] == '%' {
		// Skip over the first character if it matches "%".
		end = 1
	}
	for start < len(pattern) {
		// Advance end to the next unescaped '%'.
		escaped := false
		for end < len(pattern) && (pattern[end] != '%' || escaped) {
			escaped = pattern[end] == '\\' && !escaped
			end++
		}

		// Increment end to include the "%" wildcard at the end of the section,
		// if there is one.
		if end < len(pattern) {
			end++
		}

		// If the very first section already spans the entire trimmed pattern,
		// there is nothing left to collapse; return it without copying.
		if start == 0 && end == len(pattern) {
			return pattern
		}

		// Allocate the buffer lazily so the early return above stays
		// allocation-free.
		if sb.Cap() == 0 {
			sb.Grow(len(pattern))
		}
		sb.WriteString(pattern[start:end])

		// Find the start of the next section, skipping over duplicate "%"
		// wildcards.
		start = end
		for start < len(pattern) && pattern[start] == '%' {
			start++
		}
		end = start
	}

	return sb.String()
}
163+
81164
func matchLike(ctx *Context, left, right tree.Datum, caseInsensitive bool) (tree.Datum, error) {
82165
if left == tree.DNull || right == tree.DNull {
83166
return tree.DNull, nil

pkg/sql/sem/eval/match_test.go

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,62 @@ import (
1010
"testing"
1111
)
1212

13+
func TestCollapseLikeWildcards(t *testing.T) {
14+
for _, tc := range []struct {
15+
pattern, expected string
16+
}{
17+
{"", ""},
18+
{"a", "a"},
19+
{"abc", "abc"},
20+
{"%", "%"},
21+
{"%%", "%"},
22+
{"%%%%%%%%%", "%"},
23+
{"%%a", "%a"},
24+
{"%%abc", "%abc"},
25+
{"a%%", "a%"},
26+
{"abc%%", "abc%"},
27+
{"%%%a%%%", "%a%"},
28+
{"%%%abc%%%", "%abc%"},
29+
{"a%%%b", "a%b"},
30+
{"%%a%%%b%%", "%a%b%"},
31+
{"%%%🐛🐛%%🏠%%%", "%🐛🐛%🏠%"},
32+
{"\\%%", "\\%%"},
33+
{"\\%\\%", "\\%\\%"},
34+
{"\\%%\\%%", "\\%%\\%%"},
35+
{"\\%%%\\%%%", "\\%%\\%%"},
36+
{"\\\\%%", "\\\\%"},
37+
{"\\\\\\%%", "\\\\\\%%"},
38+
{"abc\\\\\\%%", "abc\\\\\\%%"},
39+
{"\\\\\\%%abc", "\\\\\\%%abc"},
40+
{"a\\%%%%b", "a\\%%b"},
41+
{"a\\%%%\\\\%%%b", "a\\%%\\\\%b"},
42+
{"a\\\\\\%%\\\\\\\\%%%b", "a\\\\\\%%\\\\\\\\%b"},
43+
{"\\%%%abc%%\\%", "\\%%abc%\\%"},
44+
{"%%\\%a%\\%%b\\%%%", "%\\%a%\\%%b\\%%"},
45+
{"\\\\%%%a%%\\\\%%b\\\\%%%", "\\\\%a%\\\\%b\\\\%"},
46+
{"%%\\%🐛🐛\\\\%%🏠\\%\\\\%%", "%\\%🐛🐛\\\\%🏠\\%\\\\%"},
47+
} {
48+
if res := CollapseLikeWildcards(tc.pattern); res != tc.expected {
49+
t.Errorf("expected %q to collapse to %q, got %q", tc.pattern, tc.expected, res)
50+
}
51+
}
52+
}
53+
54+
func BenchmarkCollapseLikeWildcards(b *testing.B) {
55+
for _, pattern := range []string{
56+
"somepatternwithoutwildcards",
57+
"%%%%%apatternwithbeginningandtrailingwildcards%%%%%",
58+
"%%%%%pattern%%%with%%%morewildcards%%%%%",
59+
"%%%%%🐛🐛🐛🐛🐛%%%%%🏠🏠🏠🏠🏠🏠%%%%%",
60+
} {
61+
b.Run(pattern, func(b *testing.B) {
62+
for b.Loop() {
63+
_ = CollapseLikeWildcards(pattern)
64+
}
65+
})
66+
}
67+
}
68+
1369
// TestOptimizedLike checks that for certain patterns we are using optimized
1470
// evaluation of LIKE and ILIKE rather than regex-powered evaluation.
1571
func TestOptimizedLike(t *testing.T) {

0 commit comments

Comments
 (0)