Commit 7c6f709

Merge pull request #187 from KernelTuner/refactor_interface
Refactor interfaces
2 parents a60e060 + 881042a commit 7c6f709

61 files changed: +1608 -1029 lines

doc/source/design.rst

Lines changed: 10 additions & 10 deletions
@@ -98,33 +98,33 @@ kernel_tuner.core.DeviceInterface
     :special-members: __init__
     :members:
 
-kernel_tuner.pycuda.PyCudaFunctions
+kernel_tuner.backends.pycuda.PyCudaFunctions
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: kernel_tuner.pycuda.PyCudaFunctions
+.. autoclass:: kernel_tuner.backends.pycuda.PyCudaFunctions
     :special-members: __init__
     :members:
 
-kernel_tuner.cupy.CupyFunctions
+kernel_tuner.backends.cupy.CupyFunctions
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: kernel_tuner.cupy.CupyFunctions
+.. autoclass:: kernel_tuner.backends.cupy.CupyFunctions
     :special-members: __init__
     :members:
 
-kernel_tuner.nvcuda.CudaFunctions
+kernel_tuner.backends.nvcuda.CudaFunctions
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: kernel_tuner.nvcuda.CudaFunctions
+.. autoclass:: kernel_tuner.backends.nvcuda.CudaFunctions
     :special-members: __init__
     :members:
 
-kernel_tuner.opencl.OpenCLFunctions
+kernel_tuner.backends.opencl.OpenCLFunctions
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: kernel_tuner.opencl.OpenCLFunctions
+.. autoclass:: kernel_tuner.backends.opencl.OpenCLFunctions
     :special-members: __init__
     :members:
 
-kernel_tuner.c.CFunctions
+kernel_tuner.backends.c.CFunctions
 ~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: kernel_tuner.c.CFunctions
+.. autoclass:: kernel_tuner.backends.c.CFunctions
     :special-members: __init__
     :members:
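
Note: the same rename applies to any downstream code that imports these backend classes directly; as the diff shows, only the package prefix changes while the class names stay the same. A minimal sketch of the update:

    # before this commit
    from kernel_tuner.pycuda import PyCudaFunctions

    # after this commit
    from kernel_tuner.backends.pycuda import PyCudaFunctions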

examples/cuda/convolution_correct.py

Lines changed: 49 additions & 27 deletions
@@ -26,66 +26,88 @@
 import kernel_tuner
 from collections import OrderedDict
 
+
 def tune():
-    with open('convolution.cu', 'r') as f:
+    with open("convolution.cu", "r") as f:
         kernel_string = f.read()
 
     filter_size = (17, 17)
     problem_size = (4096, 4096)
     size = numpy.prod(problem_size)
-    border_size = (filter_size[0]//2*2, filter_size[1]//2*2)
-    input_size = ((problem_size[0]+border_size[0]) * (problem_size[1]+border_size[1]))
+    border_size = (filter_size[0] // 2 * 2, filter_size[1] // 2 * 2)
+    input_size = (problem_size[0] + border_size[0]) * (problem_size[1] + border_size[1])
 
     output = numpy.zeros(size).astype(numpy.float32)
     input = numpy.random.randn(input_size).astype(numpy.float32)
 
-    filter = numpy.random.randn(filter_size[0]*filter_size[1]).astype(numpy.float32)
-    cmem_args= {'d_filter': filter }
+    filter = numpy.random.randn(filter_size[0] * filter_size[1]).astype(numpy.float32)
+    cmem_args = {"d_filter": filter}
 
     args = [output, input, filter]
     tune_params = OrderedDict()
     tune_params["filter_width"] = [filter_size[0]]
     tune_params["filter_height"] = [filter_size[1]]
 
-    #tune_params["block_size_x"] = [16*i for i in range(1,3)]
-    tune_params["block_size_x"] = [16*i for i in range(1,9)]
-    #tune_params["block_size_y"] = [2**i for i in range(1,5)]
-    tune_params["block_size_y"] = [2**i for i in range(1,6)]
+    # tune_params["block_size_x"] = [16*i for i in range(1,3)]
+    tune_params["block_size_x"] = [16 * i for i in range(1, 9)]
+    # tune_params["block_size_y"] = [2**i for i in range(1,5)]
+    tune_params["block_size_y"] = [2**i for i in range(1, 6)]
 
     tune_params["tile_size_x"] = [2**i for i in range(3)]
     tune_params["tile_size_y"] = [2**i for i in range(3)]
 
-    tune_params["use_padding"] = [0,1] #toggle the insertion of padding in shared memory
-    tune_params["read_only"] = [0,1] #toggle using the read-only cache
+    tune_params["use_padding"] = [
+        0,
+        1,
+    ]  # toggle the insertion of padding in shared memory
+    tune_params["read_only"] = [0, 1]  # toggle using the read-only cache
 
     grid_div_x = ["block_size_x", "tile_size_x"]
     grid_div_y = ["block_size_y", "tile_size_y"]
 
-    #compute the answer using a naive kernel
-    params = { "block_size_x": 16, "block_size_y": 16}
+    # compute the answer using a naive kernel
+    params = {"block_size_x": 16, "block_size_y": 16}
     tune_params["filter_width"] = [filter_size[0]]
     tune_params["filter_height"] = [filter_size[1]]
-    results = kernel_tuner.run_kernel("convolution_naive", kernel_string,
-        problem_size, args, params,
-        grid_div_y=["block_size_y"], grid_div_x=["block_size_x"], lang='cupy')
-
-    #set non-output fields to None
+    results = kernel_tuner.run_kernel(
+        "convolution_naive",
+        kernel_string,
+        problem_size,
+        args,
+        params,
+        grid_div_y=["block_size_y"],
+        grid_div_x=["block_size_x"],
+        lang="cupy",
+    )
+
+    # set non-output fields to None
     answer = [results[0], None, None]
 
-    #start kernel tuning with correctness verification
-    return kernel_tuner.tune_kernel("convolution_kernel", kernel_string,
-        problem_size, args, tune_params,
-        grid_div_y=grid_div_y, grid_div_x=grid_div_x, verbose=True, cmem_args=cmem_args, answer=answer, lang='cupy')
+    # start kernel tuning with correctness verification
+    return kernel_tuner.tune_kernel(
+        "convolution_kernel",
+        kernel_string,
+        problem_size,
+        args,
+        tune_params,
+        grid_div_y=grid_div_y,
+        grid_div_x=grid_div_x,
+        verbose=True,
+        cmem_args=cmem_args,
+        answer=answer,
+        lang="cupy",
+    )
 
 
 if __name__ == "__main__":
     import time
-    s1 = time.time()*1000
+
+    s1 = time.time() * 1000
     results = tune()
 
-    e1 = time.time()*1000
-    print("\n Actualy time used:", e1-s1)
+    e1 = time.time() * 1000
+    print("\n Actual time used:", e1 - s1)
     import json
-    with open("convolution_RTX_2070.json", 'w') as fp:
-        json.dump(results, fp)
 
+    with open("convolution_RTX_2070.json", "w") as fp:
+        json.dump(results, fp)

examples/cuda/vector_add_observers.py

Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@
 
 import numpy
 from kernel_tuner import tune_kernel
-from kernel_tuner.nvml import NVMLObserver
+from kernel_tuner.observers.nvml import NVMLObserver
 
 def tune():
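
The observers now live in their own subpackage; usage is otherwise unchanged. A minimal sketch of how the observer is hooked into a tuning run, with kernel_string, size, args, and tune_params as in the vector_add example (the observable names "nvml_energy" and "temperature" are illustrative; see the NVMLObserver documentation for the supported set):

    from kernel_tuner import tune_kernel
    from kernel_tuner.observers.nvml import NVMLObserver

    # record energy and GPU temperature for every kernel configuration
    observers = [NVMLObserver(["nvml_energy", "temperature"])]
    results, env = tune_kernel("vector_add", kernel_string, size, args,
                               tune_params, observers=observers)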

examples/opencl/vector_add_observers.py

Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@
 
 import numpy
 from kernel_tuner import tune_kernel
-from kernel_tuner.nvml import NVMLObserver
+from kernel_tuner.observers.nvml import NVMLObserver
 
 def tune():

kernel_tuner/backends/__init__.py

Whitespace-only changes.

kernel_tuner/backends/backend.py

Lines changed: 89 additions & 0 deletions
@@ -0,0 +1,89 @@
+"""This module contains the interface of all kernel_tuner backends"""
+from __future__ import print_function
+
+from abc import ABC, abstractmethod
+
+
+class Backend(ABC):
+    """Base class for kernel_tuner backends"""
+
+    @abstractmethod
+    def ready_argument_list(self, arguments):
+        """This method must implement the allocation of the arguments on device memory."""
+        pass
+
+    @abstractmethod
+    def compile(self, kernel_instance):
+        """This method must implement the compilation of a kernel into a callable function."""
+        pass
+
+    @abstractmethod
+    def start_event(self):
+        """This method must implement the recording of the start of a measurement."""
+        pass
+
+    @abstractmethod
+    def stop_event(self):
+        """This method must implement the recording of the end of a measurement."""
+        pass
+
+    @abstractmethod
+    def kernel_finished(self):
+        """This method must implement a check that returns True if the kernel has finished, False otherwise."""
+        pass
+
+    @abstractmethod
+    def synchronize(self):
+        """This method must implement a barrier that halts execution until device has finished its tasks."""
+        pass
+
+    @abstractmethod
+    def run_kernel(self, func, gpu_args, threads, grid, stream):
+        """This method must implement the execution of the kernel on the device."""
+        pass
+
+    @abstractmethod
+    def memset(self, allocation, value, size):
+        """This method must implement setting the memory to a value on the device."""
+        pass
+
+    @abstractmethod
+    def memcpy_dtoh(self, dest, src):
+        """This method must implement a device to host copy."""
+        pass
+
+    @abstractmethod
+    def memcpy_htod(self, dest, src):
+        """This method must implement a host to device copy."""
+        pass
+
+
+class GPUBackend(Backend):
+    """Base class for GPU backends"""
+
+    @abstractmethod
+    def __init__(self, device, iterations, compiler_options, observers):
+        pass
+
+    @abstractmethod
+    def copy_constant_memory_args(self, cmem_args):
+        """This method must implement the allocation and copy of constant memory to the GPU."""
+        pass
+
+    @abstractmethod
+    def copy_shared_memory_args(self, smem_args):
+        """This method must implement the dynamic allocation of shared memory on the GPU."""
+        pass
+
+    @abstractmethod
+    def copy_texture_memory_args(self, texmem_args):
+        """This method must implement the allocation and copy of texture memory to the GPU."""
+        pass
+
+
+class CompilerBackend(Backend):
+    """Base class for compiler backends"""
+
+    @abstractmethod
+    def __init__(self, iterations, compiler_options, compiler):
+        pass
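
These abstract base classes spell out the contract every backend must satisfy, and Python's ABC machinery enforces it at instantiation time. A minimal sketch of a conforming subclass (hypothetical, with no-op stubs; the real backends under kernel_tuner/backends/ are the authoritative implementations):

    from kernel_tuner.backends.backend import GPUBackend

    class DummyFunctions(GPUBackend):
        """Hypothetical backend used only to illustrate the interface."""

        def __init__(self, device=0, iterations=7, compiler_options=None, observers=None):
            self.iterations = iterations

        def ready_argument_list(self, arguments):
            # a real backend allocates device memory here
            return list(arguments)

        def compile(self, kernel_instance):
            # a real backend compiles the source and returns a callable kernel
            return lambda *args: None

        def start_event(self):
            pass

        def stop_event(self):
            pass

        def kernel_finished(self):
            return True

        def synchronize(self):
            pass

        def run_kernel(self, func, gpu_args, threads, grid, stream=None):
            func(*gpu_args)

        def memset(self, allocation, value, size):
            pass

        def memcpy_dtoh(self, dest, src):
            pass

        def memcpy_htod(self, dest, src):
            pass

        def copy_constant_memory_args(self, cmem_args):
            pass

        def copy_shared_memory_args(self, smem_args):
            pass

        def copy_texture_memory_args(self, texmem_args):
            pass

    # omitting any of the @abstractmethod overrides above makes this raise TypeError
    backend = DummyFunctions()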
