Skip to content

Commit 921ea56

Browse files
committed
test loop fusion implementation
1 parent 6dc45f6 commit 921ea56

File tree

1 file changed

+376
-0
lines changed

1 file changed

+376
-0
lines changed

test/test_loop_fusion.py

Lines changed: 376 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,376 @@
1+
__copyright__ = "Copyright (C) 2021 Kaushik Kulkarni"
2+
3+
__license__ = """
4+
Permission is hereby granted, free of charge, to any person obtaining a copy
5+
of this software and associated documentation files (the "Software"), to deal
6+
in the Software without restriction, including without limitation the rights
7+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8+
copies of the Software, and to permit persons to whom the Software is
9+
furnished to do so, subject to the following conditions:
10+
11+
The above copyright notice and this permission notice shall be included in
12+
all copies or substantial portions of the Software.
13+
14+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20+
THE SOFTWARE.
21+
"""
22+
23+
import sys
24+
import numpy as np
25+
import loopy as lp
26+
import pyopencl as cl
27+
import pyopencl.clmath # noqa
28+
import pyopencl.clrandom # noqa
29+
30+
import logging
31+
logger = logging.getLogger(__name__)
32+
33+
# Enable faulthandler when the module can be imported; if the import fails,
# carry on without it.
try:
    import faulthandler
except ImportError:
    pass
else:
    faulthandler.enable()
39+
40+
from pyopencl.tools import pytest_generate_tests_for_pyopencl \
41+
as pytest_generate_tests
42+
43+
from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2 # noqa
44+
45+
# NOTE(review): presumably listed here so that the imported names count as
# used even where the test bodies do not reference them -- confirm.
__all__ = [
        "pytest_generate_tests",
        "cl"  # "cl.create_some_context"
        ]
49+
50+
51+
def test_loop_fusion_vanilla(ctx_factory):
    """Fuse the candidate inames ``j0`` and ``j1`` of a three-statement kernel.

    The test expects the iname count to drop from 4 to 3, ``write_b`` and
    ``write_c`` to share exactly one iname afterwards, and the transformed
    kernel to agree with the untransformed one.
    """
    cl_ctx = ctx_factory()

    ref_t_unit = lp.make_kernel(
        "{[i0, i1, j0, j1]: 0 <= i0, i1, j0, j1 < 10}",
        """
        a[i0] = 1
        b[i1, j0] = 2 {id=write_b}
        c[j1] = 3 {id=write_c}
        """)

    # Ask for fusion candidates among j0/j1, then apply the proposed renaming.
    candidates = lp.get_kennedy_unweighted_fusion_candidates(
        ref_t_unit["loopy_kernel"], frozenset(["j0", "j1"]))
    fused_knl = lp.rename_inames_in_batch(
        ref_t_unit["loopy_kernel"], candidates)
    t_unit = ref_t_unit.with_kernel(fused_knl)

    assert len(ref_t_unit["loopy_kernel"].all_inames()) == 4
    assert len(t_unit["loopy_kernel"].all_inames()) == 3

    insns = t_unit["loopy_kernel"].id_to_insn
    shared = insns["write_b"].within_inames & insns["write_c"].within_inames
    assert len(shared) == 1

    lp.auto_test_vs_ref(ref_t_unit, cl_ctx, t_unit)
74+
75+
76+
def test_loop_fusion_outer_iname_preventing_fusion(ctx_factory):
    """Check that ``j0`` and ``j1`` stay separate when only ``j0`` has an
    outer iname.

    ``write_b`` indexes ``b`` with both ``i0`` and ``j0`` whereas ``write_c``
    only uses ``j1``.  The test expects the iname count to stay at 3, the two
    instructions to share no iname after the transformation, and the
    transformed kernel to agree with the untransformed one.
    """
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{[i0, j0, j1]: 0 <= i0, j0, j1 < 10}",
        """
        a[i0] = 1
        b[i0, j0] = 2 {id=write_b}
        c[j1] = 3 {id=write_c}
        """)
    ref_knl = knl

    fused_chunks = lp.get_kennedy_unweighted_fusion_candidates(
        knl["loopy_kernel"], frozenset(["j0", "j1"]))

    knl = knl.with_kernel(lp.rename_inames_in_batch(knl["loopy_kernel"],
                                                    fused_chunks))

    # Compare the reference against the transformed kernel.  The original
    # asserted on ``knl`` twice, so the reference count was never checked.
    assert len(ref_knl["loopy_kernel"].all_inames()) == 3
    assert len(knl["loopy_kernel"].all_inames()) == 3
    assert len(knl["loopy_kernel"].id_to_insn["write_b"].within_inames
               & knl["loopy_kernel"].id_to_insn["write_c"].within_inames) == 0

    lp.auto_test_vs_ref(ref_knl, ctx, knl)
100+
101+
102+
def test_loop_fusion_with_loop_independent_deps(ctx_factory):
    """Fuse ``j0`` and ``j1`` when the second statement reads ``a[j1]``,
    which the first statement writes as ``a[j0]``.

    The test expects the two inames to collapse into one and the transformed
    kernel to agree with the untransformed one.
    """
    cl_ctx = ctx_factory()

    ref_t_unit = lp.make_kernel(
        "{[j0, j1]: 0 <= j0, j1 < 10}",
        """
        a[j0] = 1
        b[j1] = 2 * a[j1]
        """, seq_dependencies=True)

    candidates = lp.get_kennedy_unweighted_fusion_candidates(
        ref_t_unit["loopy_kernel"], frozenset(["j0", "j1"]))
    fused_knl = lp.rename_inames_in_batch(
        ref_t_unit["loopy_kernel"], candidates)
    t_unit = ref_t_unit.with_kernel(fused_knl)

    assert len(ref_t_unit["loopy_kernel"].all_inames()) == 2
    assert len(t_unit["loopy_kernel"].all_inames()) == 1

    lp.auto_test_vs_ref(ref_t_unit, cl_ctx, t_unit)
124+
125+
126+
def test_loop_fusion_constrained_by_outer_loop_deps(ctx_factory):
    """Check that ``j0`` and ``j1`` are not fused across a scalar write.

    ``write_b`` uses no iname and sits between the two loops in a kernel built
    with ``seq_dependencies=True``.  The test expects both inames to survive,
    ``write_a`` and ``write_c`` to share no iname, and the transformed kernel
    to agree with the untransformed one.
    """
    cl_ctx = ctx_factory()

    ref_t_unit = lp.make_kernel(
        "{[j0, j1]: 0 <= j0, j1 < 10}",
        """
        a[j0] = 1 {id=write_a}
        b = 2 {id=write_b}
        c[j1] = 2 * a[j1] {id=write_c}
        """, seq_dependencies=True)

    candidates = lp.get_kennedy_unweighted_fusion_candidates(
        ref_t_unit["loopy_kernel"], frozenset(["j0", "j1"]))
    fused_knl = lp.rename_inames_in_batch(
        ref_t_unit["loopy_kernel"], candidates)
    t_unit = ref_t_unit.with_kernel(fused_knl)

    assert len(ref_t_unit["loopy_kernel"].all_inames()) == 2
    assert len(t_unit["loopy_kernel"].all_inames()) == 2

    insns = t_unit["loopy_kernel"].id_to_insn
    shared = insns["write_a"].within_inames & insns["write_c"].within_inames
    assert len(shared) == 0

    lp.auto_test_vs_ref(ref_t_unit, cl_ctx, t_unit)
151+
152+
153+
def test_loop_fusion_with_loop_carried_deps1(ctx_factory):
    """Fuse ``i0`` and ``i1`` when the second loop writes ``x[i1-1]`` after
    the first loop wrote ``x[i0]``.

    The test expects the two inames to be fused into one, shared by
    ``first_write`` and ``second_write``, and the transformed kernel to agree
    with the untransformed one.
    """
    cl_ctx = ctx_factory()

    ref_t_unit = lp.make_kernel(
        "{[i0, i1]: 1<=i0, i1<10}",
        """
        x[i0] = i0 {id=first_write}
        x[i1-1] = i1 ** 2 {id=second_write}
        """,
        seq_dependencies=True)

    candidates = lp.get_kennedy_unweighted_fusion_candidates(
        ref_t_unit["loopy_kernel"], frozenset(["i0", "i1"]))
    fused_knl = lp.rename_inames_in_batch(
        ref_t_unit["loopy_kernel"], candidates)
    t_unit = ref_t_unit.with_kernel(fused_knl)

    assert len(ref_t_unit["loopy_kernel"].all_inames()) == 2
    assert len(t_unit["loopy_kernel"].all_inames()) == 1

    insns = t_unit["loopy_kernel"].id_to_insn
    shared = (insns["first_write"].within_inames
              & insns["second_write"].within_inames)
    assert len(shared) == 1

    lp.auto_test_vs_ref(ref_t_unit, cl_ctx, t_unit)
179+
180+
181+
def test_loop_fusion_with_loop_carried_deps2(ctx_factory):
    """Check that ``i0`` and ``i1`` stay separate when the first loop writes
    ``x[i0-1]`` and the second loop writes ``x[i1]``.

    The test expects both inames to survive, ``first_write`` and
    ``second_write`` to share no iname, and the transformed kernel to agree
    with the untransformed one.
    """
    cl_ctx = ctx_factory()

    ref_t_unit = lp.make_kernel(
        "{[i0, i1]: 1<=i0, i1<10}",
        """
        x[i0-1] = i0 {id=first_write}
        x[i1] = i1 ** 2 {id=second_write}
        """,
        seq_dependencies=True)

    candidates = lp.get_kennedy_unweighted_fusion_candidates(
        ref_t_unit["loopy_kernel"], frozenset(["i0", "i1"]))
    fused_knl = lp.rename_inames_in_batch(
        ref_t_unit["loopy_kernel"], candidates)
    t_unit = ref_t_unit.with_kernel(fused_knl)

    assert len(ref_t_unit["loopy_kernel"].all_inames()) == 2
    assert len(t_unit["loopy_kernel"].all_inames()) == 2

    insns = t_unit["loopy_kernel"].id_to_insn
    shared = (insns["first_write"].within_inames
              & insns["second_write"].within_inames)
    assert len(shared) == 0

    lp.auto_test_vs_ref(ref_t_unit, cl_ctx, t_unit)
206+
207+
208+
def test_loop_fusion_with_indirection(ctx_factory):
    """Check that ``i0`` and ``i1`` stay separate when the second loop writes
    ``x`` through the index array ``map``.

    The test expects both inames to survive and ``first_write`` and
    ``second_write`` to share no iname.  Both kernels are then run on the
    same random permutation and their outputs compared.
    """
    cl_ctx = ctx_factory()
    permutation = np.random.permutation(10)
    queue = cl.CommandQueue(cl_ctx)

    ref_t_unit = lp.make_kernel(
        "{[i0, i1]: 0<=i0, i1<10}",
        """
        x[i0] = i0 {id=first_write}
        x[map[i1]] = i1 ** 2 {id=second_write}
        """,
        seq_dependencies=True)

    candidates = lp.get_kennedy_unweighted_fusion_candidates(
        ref_t_unit["loopy_kernel"], frozenset(["i0", "i1"]))
    fused_knl = lp.rename_inames_in_batch(
        ref_t_unit["loopy_kernel"], candidates)
    t_unit = ref_t_unit.with_kernel(fused_knl)

    assert len(ref_t_unit["loopy_kernel"].all_inames()) == 2
    assert len(t_unit["loopy_kernel"].all_inames()) == 2

    insns = t_unit["loopy_kernel"].id_to_insn
    shared = (insns["first_write"].within_inames
              & insns["second_write"].within_inames)
    assert len(shared) == 0

    # Execute both kernels directly on the same index array.
    _, (expected,) = ref_t_unit(queue, map=permutation)
    _, (actual,) = t_unit(queue, map=permutation)
    np.testing.assert_allclose(expected, actual)
238+
239+
240+
def test_loop_fusion_with_induced_dependencies_from_sibling_nests(ctx_factory):
    """Run the fusion transformation on ``i0``/``i1`` in a kernel whose ``j``
    loop sits between them in the data flow.

    The test makes no structural assertion; it only requires that the
    transformed kernel still runs and agrees with the untransformed one.
    """
    cl_ctx = ctx_factory()
    ref_t_unit = lp.make_kernel(
        "{[i0, j, i1, i2]: 0<=i0, j, i1, i2<10}",
        """
        <> tmp0[i0] = i0
        <> tmp1[j] = tmp0[j]
        <> tmp2[j] = j
        out1[i1] = tmp2[i1]
        out2[i2] = 2 * tmp1[i2]
        """)

    entry_knl = ref_t_unit.default_entrypoint
    candidates = lp.get_kennedy_unweighted_fusion_candidates(
        entry_knl, frozenset(["i0", "i1"]))
    t_unit = ref_t_unit.with_kernel(
        lp.rename_inames_in_batch(entry_knl, candidates))

    # 'i0' and 'i1' should not be fused: the 'j' loop reads 'tmp0' (written in
    # the 'i0' loop) and the 'i1' loop reads 'tmp2' (written in the 'j' loop).
    # Fusing them would lead to an unschedulable kernel, so checking that the
    # kernel runs suffices to show the transformation was successful.
    lp.auto_test_vs_ref(ref_t_unit, cl_ctx, t_unit)
263+
264+
265+
def test_loop_fusion_on_reduction_inames(ctx_factory):
    """Fuse the inner reduction inames ``j1`` and ``j2``, both nested inside
    a reduction over ``j0``.

    The test expects both instructions to end up with equal sets of reduction
    inames and the transformed kernel to agree with the untransformed one.
    """
    cl_ctx = ctx_factory()

    kernel_data = [lp.GlobalArg("A", dtype=np.float64, shape=lp.auto), ...]
    ref_t_unit = lp.make_kernel(
        "{[i, j0, j1, j2]: 0<=i, j0, j1, j2<10}",
        """
        y0[i] = sum(j0, sum([j1], 2*A[i, j0, j1]))
        y1[i] = sum(j0, sum([j2], 3*A[i, j0, j2]))
        """, kernel_data)

    entry_knl = ref_t_unit.default_entrypoint
    candidates = lp.get_kennedy_unweighted_fusion_candidates(
        entry_knl, frozenset(["j1", "j2"]))
    fused_knl = lp.rename_inames_in_batch(entry_knl, candidates)

    first_redn = fused_knl.id_to_insn["insn"].reduction_inames()
    second_redn = fused_knl.id_to_insn["insn_0"].reduction_inames()
    assert first_redn == second_redn

    t_unit = ref_t_unit.with_kernel(fused_knl)
    lp.auto_test_vs_ref(ref_t_unit, cl_ctx, t_unit)
287+
288+
289+
def test_loop_fusion_on_reduction_inames_with_depth_mismatch(ctx_factory):
    """Check that the inner reduction inames ``j1`` and ``j3`` are not fused
    when their enclosing reductions use different inames (``j0`` vs ``j2``).

    The test expects the two instructions to keep differing sets of reduction
    inames and the transformed kernel to agree with the untransformed one.
    """
    cl_ctx = ctx_factory()

    kernel_data = [lp.GlobalArg("A", dtype=np.float64, shape=lp.auto), ...]
    ref_t_unit = lp.make_kernel(
        "{[i, j0, j1, j2, j3]: 0<=i, j0, j1, j2, j3<10}",
        """
        y0[i] = sum(j0, sum([j1], 2*A[i, j0, j1]))
        y1[i] = sum(j2, sum([j3], 3*A[i, j3, j2]))
        """, kernel_data)

    entry_knl = ref_t_unit.default_entrypoint
    candidates = lp.get_kennedy_unweighted_fusion_candidates(
        entry_knl, frozenset(["j1", "j3"]))
    fused_knl = lp.rename_inames_in_batch(entry_knl, candidates)

    # 'j1' and 'j3' cannot be fused because they are not nested within the
    # same outer inames.
    first_redn = fused_knl.id_to_insn["insn"].reduction_inames()
    second_redn = fused_knl.id_to_insn["insn_0"].reduction_inames()
    assert first_redn != second_redn

    t_unit = ref_t_unit.with_kernel(fused_knl)
    lp.auto_test_vs_ref(ref_t_unit, cl_ctx, t_unit)
315+
316+
317+
def test_loop_fusion_on_outer_reduction_inames(ctx_factory):
    """Fuse the outer reduction inames ``j0`` and ``j2`` of two nested
    reductions.

    The test expects the two instructions to share exactly one reduction
    iname afterwards and the transformed kernel to agree with the
    untransformed one.
    """
    cl_ctx = ctx_factory()

    kernel_data = [lp.GlobalArg("A", dtype=np.float64, shape=lp.auto), ...]
    ref_t_unit = lp.make_kernel(
        "{[i, j0, j1, j2, j3]: 0<=i, j0, j1, j2, j3<10}",
        """
        y0[i] = sum(j0, sum([j1], 2*A[i, j0, j1]))
        y1[i] = sum(j2, sum([j3], 3*A[i, j3, j2]))
        """, kernel_data)

    entry_knl = ref_t_unit.default_entrypoint
    candidates = lp.get_kennedy_unweighted_fusion_candidates(
        entry_knl, frozenset(["j0", "j2"]))
    fused_knl = lp.rename_inames_in_batch(entry_knl, candidates)

    first_redn = fused_knl.id_to_insn["insn"].reduction_inames()
    second_redn = fused_knl.id_to_insn["insn_0"].reduction_inames()
    assert len(first_redn & second_redn) == 1

    t_unit = ref_t_unit.with_kernel(fused_knl)
    lp.auto_test_vs_ref(ref_t_unit, cl_ctx, t_unit)
341+
342+
343+
def test_loop_fusion_reduction_inames_simple(ctx_factory):
    """Fuse the reduction inames ``j0`` and ``j1`` of two single-level
    reductions.

    The test expects both instructions to end up with equal sets of reduction
    inames and the transformed kernel to agree with the untransformed one.
    """
    cl_ctx = ctx_factory()

    kernel_data = [lp.GlobalArg("A", dtype=np.float64, shape=lp.auto), ...]
    ref_t_unit = lp.make_kernel(
        "{[i, j0, j1]: 0<=i, j0, j1<10}",
        """
        y0[i] = sum(j0, 2*A[i, j0])
        y1[i] = sum(j1, 3*A[i, j1])
        """, kernel_data)

    entry_knl = ref_t_unit.default_entrypoint
    candidates = lp.get_kennedy_unweighted_fusion_candidates(
        entry_knl, frozenset(["j0", "j1"]))
    fused_knl = lp.rename_inames_in_batch(entry_knl, candidates)

    first_redn = fused_knl.id_to_insn["insn"].reduction_inames()
    second_redn = fused_knl.id_to_insn["insn_0"].reduction_inames()
    assert first_redn == second_redn

    t_unit = ref_t_unit.with_kernel(fused_knl)
    lp.auto_test_vs_ref(ref_t_unit, cl_ctx, t_unit)
367+
368+
369+
if __name__ == "__main__":
    # With no extra command-line argument, hand this file to pytest;
    # otherwise execute the first argument as Python source.
    if len(sys.argv) <= 1:
        from pytest import main
        main([__file__])
    else:
        exec(sys.argv[1])
375+
376+
# vim: fdm=marker

0 commit comments

Comments
 (0)