Skip to content

Commit c929405

Browse files
mapreduce: reinstate and fix block optimization (#2880)
Co-authored-by: Tim Besard <tim.besard@gmail.com>
1 parent b9f7c41 commit c929405

File tree

2 files changed

+56
-8
lines changed

2 files changed

+56
-8
lines changed

src/mapreduce.jl

Lines changed: 55 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
## COV_EXCL_START
22

33
# TODO
4-
# - serial version for lower latency
54
# - block-stride loop to delay need for second kernel launch
65

76
# Reduce a value across a warp
@@ -134,7 +133,7 @@ function partial_mapreduce_grid(f, op, neutral, Rreduce, Rother, shuffle, R::Abs
134133
return
135134
end
136135

137-
function big_mapreduce_kernel(f, op, neutral, Rreduce, Rother, R, As)
136+
function serial_mapreduce_kernel(f, op, neutral, Rreduce, Rother, R, As)
138137
grid_idx = threadIdx().x + (blockIdx().x - 1i32) * blockDim().x
139138
@inbounds if grid_idx <= length(Rother)
140139
Iother = Rother[grid_idx]
@@ -160,7 +159,7 @@ end
160159
## COV_EXCL_STOP
161160

162161
# factored out for use in tests
163-
function big_mapreduce_threshold(dev)
162+
function serial_mapreduce_threshold(dev)
164163
max_concurrency = attribute(dev, DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK) *
165164
attribute(dev, DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT)
166165
return max_concurrency
@@ -197,9 +196,9 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T},
197196
@assert length(Rother) > 0
198197

199198
# If `Rother` is large enough, then a naive loop is more efficient than partial reductions.
200-
if length(Rother) >= big_mapreduce_threshold(dev)
199+
if length(Rother) >= serial_mapreduce_threshold(dev)
201200
args = (f, op, init, Rreduce, Rother, R, A)
202-
kernel = @cuda launch=false big_mapreduce_kernel(args...)
201+
kernel = @cuda launch=false serial_mapreduce_kernel(args...)
203202
kernel_config = launch_configuration(kernel.fun)
204203
threads = kernel_config.threads
205204
blocks = cld(length(Rother), threads)
@@ -232,13 +231,62 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T},
232231
reduce_threads = compute_threads(kernel_config.threads)
233232
reduce_shmem = compute_shmem(reduce_threads)
234233

234+
# how many blocks should we launch?
235+
#
236+
# even though we can always reduce each slice in a single thread block, that may not be
237+
# optimal as it might not saturate the GPU. we already launch some blocks to process
238+
# independent dimensions in parallel; pad that number to ensure full occupancy.
239+
other_blocks = length(Rother)
240+
reduce_blocks = if other_blocks >= kernel_config.blocks
241+
1
242+
else
243+
min(cld(length(Rreduce), reduce_threads), # how many we need at most
244+
cld(kernel_config.blocks, other_blocks)) # maximize occupancy
245+
end
246+
235247
# determine the launch configuration
236248
threads = reduce_threads
237249
shmem = reduce_shmem
238-
blocks = length(Rother)
250+
blocks = reduce_blocks*other_blocks
239251

240252
# perform the actual reduction
241-
kernel(f, op, init, Rreduce, Rother, Val(shuffle), R, A; threads, blocks, shmem)
253+
if reduce_blocks == 1
254+
# we can cover the dimensions to reduce using a single block
255+
kernel(f, op, init, Rreduce, Rother, Val(shuffle), R, A; threads, blocks, shmem)
256+
else
257+
# TODO: provide a version that atomically reduces from different blocks
258+
259+
# temporary empty array whose type will match the final partial array
260+
partial = similar(R, ntuple(_ -> 0, Val(ndims(R)+1)))
261+
262+
# NOTE: we can't use the previously-compiled kernel, or its launch configuration,
263+
# since the type of `partial` might not match the original output container
264+
# (e.g. if that was a view).
265+
partial_kernel = @cuda launch=false partial_mapreduce_grid(f, op, init, Rreduce, Rother, Val(shuffle), partial, A)
266+
partial_kernel_config = launch_configuration(partial_kernel.fun; shmem=compute_shmem∘compute_threads)
267+
partial_reduce_threads = compute_threads(partial_kernel_config.threads)
268+
partial_reduce_shmem = compute_shmem(partial_reduce_threads)
269+
partial_reduce_blocks = if other_blocks >= partial_kernel_config.blocks
270+
1
271+
else
272+
min(cld(length(Rreduce), partial_reduce_threads),
273+
cld(partial_kernel_config.blocks, other_blocks))
274+
end
275+
partial_threads = partial_reduce_threads
276+
partial_shmem = partial_reduce_shmem
277+
partial_blocks = partial_reduce_blocks*other_blocks
278+
279+
partial = similar(R, (size(R)..., partial_blocks))
280+
if init === nothing
281+
# without an explicit initializer we need to copy from the output container
282+
partial .= R
283+
end
284+
285+
partial_kernel(f, op, init, Rreduce, Rother, Val(shuffle), partial, A;
286+
threads=partial_threads, blocks=partial_blocks, shmem=partial_shmem)
287+
288+
GPUArrays.mapreducedim!(identity, op, R, partial; init)
289+
end
242290

243291
return R
244292
end

test/base/array.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -718,7 +718,7 @@ end
718718
@testset "large map reduce" begin
719719
dev = device()
720720

721-
big_size = CUDA.big_mapreduce_threshold(dev) + 5
721+
big_size = CUDA.serial_mapreduce_threshold(dev) + 5
722722
a = rand(Float32, big_size, 31)
723723
c = CuArray(a)
724724

0 commit comments

Comments
 (0)