|
@@ -1,7 +1,6 @@
 ## COV_EXCL_START

 # TODO
-# - serial version for lower latency
 # - block-stride loop to delay need for second kernel launch

 # Reduce a value across a warp
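
The warp-level reduction referenced by this comment combines the values held by the 32 lanes of a warp using shuffle intrinsics, halving the number of participating lanes at each step. A minimal sketch of that idea (not the kernel from this diff; `reduce_warp_sketch` is a hypothetical name, and the full-warp mask and loop bounds are assumptions), intended to be called from device code:

    using CUDA

    # hypothetical device-side helper: butterfly-style warp reduction via shfl_down_sync,
    # assuming all 32 lanes are active and `op` is associative
    @inline function reduce_warp_sketch(op, val)
        offset = UInt32(1)
        while offset < UInt32(32)                        # warpsize
            # fetch the value held by the lane `offset` positions down, then combine
            val = op(val, shfl_down_sync(0xffffffff, val, offset))
            offset <<= 1
        end
        return val                                       # lane 1 ends up with the warp-wide result
    end
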
@@ -134,7 +133,7 @@ function partial_mapreduce_grid(f, op, neutral, Rreduce, Rother, shuffle, R::Abs
     return
 end

-function big_mapreduce_kernel(f, op, neutral, Rreduce, Rother, R, As)
+function serial_mapreduce_kernel(f, op, neutral, Rreduce, Rother, R, As)
     grid_idx = threadIdx().x + (blockIdx().x - 1i32) * blockDim().x
     @inbounds if grid_idx <= length(Rother)
         Iother = Rother[grid_idx]
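
The hunk above only shows the head of the renamed kernel; its body lets each thread fold its own output slice in an ordinary loop. A self-contained sketch of that pattern (hypothetical `serial_reduce_sketch`, simplified to take an explicit neutral value; the elementwise-`max` merge of the "other" and "reduce" CartesianIndexes follows the convention used by these kernels):

    using CUDA

    # one thread per output element, each folding the whole reduction range serially
    function serial_reduce_sketch(f, op, neutral, Rreduce, Rother, R, A)
        grid_idx = threadIdx().x + (blockIdx().x - 1) * blockDim().x
        @inbounds if grid_idx <= length(Rother)
            Iother = Rother[grid_idx]
            val = neutral
            for Ireduce in Rreduce
                J = max(Iother, Ireduce)   # merge the per-slice and reduction indices
                val = op(val, f(A[J]))
            end
            R[Iother] = val
        end
        return
    end

    a = CUDA.rand(32, 1024)
    r = CUDA.zeros(Float32, 32, 1)
    Rreduce = CartesianIndices((1:1, 1:1024))   # dimensions being reduced
    Rother  = CartesianIndices((1:32, 1:1))     # remaining dimensions
    @cuda threads=32 serial_reduce_sketch(identity, +, 0f0, Rreduce, Rother, r, a)
    @assert Array(r) ≈ Array(sum(a; dims=2))
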
|
@@ -160,7 +159,7 @@
 ## COV_EXCL_STOP

 # factored out for use in tests
-function big_mapreduce_threshold(dev)
+function serial_mapreduce_threshold(dev)
     max_concurrency = attribute(dev, DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK) *
                       attribute(dev, DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT)
     return max_concurrency
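
The threshold is simply an estimate of how many threads the device can keep in flight: threads-per-block times the multiprocessor count. Queried from user code it would look roughly like this (a sketch; reaching the unprefixed attribute constants as `CUDA.DEVICE_ATTRIBUTE_*` and using `CUDA.attribute` from outside the package are assumptions):

    using CUDA

    # once there are at least this many independent slices, one thread per slice
    # (the serial kernel) already saturates the device
    dev = device()
    max_concurrency = CUDA.attribute(dev, CUDA.DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK) *
                      CUDA.attribute(dev, CUDA.DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT)
    println("serial mapreduce pays off for >= $max_concurrency slices")
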
@@ -197,9 +196,9 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T},
     @assert length(Rother) > 0

     # If `Rother` is large enough, then a naive loop is more efficient than partial reductions.
-    if length(Rother) >= big_mapreduce_threshold(dev)
+    if length(Rother) >= serial_mapreduce_threshold(dev)
         args = (f, op, init, Rreduce, Rother, R, A)
-        kernel = @cuda launch=false big_mapreduce_kernel(args...)
+        kernel = @cuda launch=false serial_mapreduce_kernel(args...)
         kernel_config = launch_configuration(kernel.fun)
         threads = kernel_config.threads
         blocks = cld(length(Rother), threads)
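
The launch pattern used here, compile with `@cuda launch=false`, ask the occupancy API for a block size, then derive the grid size from the problem, is the standard CUDA.jl idiom. A minimal, self-contained illustration on a toy kernel (hypothetical `vadd_sketch`, not taken from this file):

    using CUDA

    function vadd_sketch(c, a, b)
        i = threadIdx().x + (blockIdx().x - 1) * blockDim().x
        if i <= length(c)
            @inbounds c[i] = a[i] + b[i]
        end
        return
    end

    a = CUDA.rand(100_000); b = CUDA.rand(100_000); c = similar(a)
    kernel = @cuda launch=false vadd_sketch(c, a, b)    # compile without launching
    config = launch_configuration(kernel.fun)           # occupancy-derived limits
    threads = min(length(c), config.threads)
    blocks = cld(length(c), threads)                    # cover the whole array
    kernel(c, a, b; threads, blocks)
    @assert Array(c) ≈ Array(a) + Array(b)
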
@@ -232,13 +231,62 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T},
     reduce_threads = compute_threads(kernel_config.threads)
     reduce_shmem = compute_shmem(reduce_threads)

+    # how many blocks should we launch?
+    #
+    # even though we can always reduce each slice in a single thread block, that may not be
+    # optimal as it might not saturate the GPU. we already launch some blocks to process
+    # independent dimensions in parallel; pad that number to ensure full occupancy.
+    other_blocks = length(Rother)
+    reduce_blocks = if other_blocks >= kernel_config.blocks
+        1
+    else
+        min(cld(length(Rreduce), reduce_threads),       # how many we need at most
+            cld(kernel_config.blocks, other_blocks))    # maximize occupancy
+    end
+
     # determine the launch configuration
     threads = reduce_threads
     shmem = reduce_shmem
-    blocks = length(Rother)
+    blocks = reduce_blocks*other_blocks

     # perform the actual reduction
-    kernel(f, op, init, Rreduce, Rother, Val(shuffle), R, A; threads, blocks, shmem)
+    if reduce_blocks == 1
+        # we can cover the dimensions to reduce using a single block
+        kernel(f, op, init, Rreduce, Rother, Val(shuffle), R, A; threads, blocks, shmem)
+    else
+        # TODO: provide a version that atomically reduces from different blocks
+
+        # temporary empty array whose type will match the final partial array
+        partial = similar(R, ntuple(_ -> 0, Val(ndims(R)+1)))
+
+        # NOTE: we can't use the previously-compiled kernel, or its launch configuration,
+        #       since the type of `partial` might not match the original output container
+        #       (e.g. if that was a view).
+        partial_kernel = @cuda launch=false partial_mapreduce_grid(f, op, init, Rreduce, Rother, Val(shuffle), partial, A)
+        partial_kernel_config = launch_configuration(partial_kernel.fun; shmem=compute_shmem∘compute_threads)
+        partial_reduce_threads = compute_threads(partial_kernel_config.threads)
+        partial_reduce_shmem = compute_shmem(partial_reduce_threads)
+        partial_reduce_blocks = if other_blocks >= partial_kernel_config.blocks
+            1
+        else
+            min(cld(length(Rreduce), partial_reduce_threads),
+                cld(partial_kernel_config.blocks, other_blocks))
+        end
+        partial_threads = partial_reduce_threads
+        partial_shmem = partial_reduce_shmem
+        partial_blocks = partial_reduce_blocks*other_blocks
+
+        partial = similar(R, (size(R)..., partial_blocks))
+        if init === nothing
+            # without an explicit initializer we need to copy from the output container
+            partial .= R
+        end
+
+        partial_kernel(f, op, init, Rreduce, Rother, Val(shuffle), partial, A;
+                       threads=partial_threads, blocks=partial_blocks, shmem=partial_shmem)
+
+        GPUArrays.mapreducedim!(identity, op, R, partial; init)
+    end

     return R
 end
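
The new `else` branch implements a two-pass scheme: when one block cannot cover a slice, several blocks each write a partial result into an extra trailing dimension of `partial`, and a second `mapreducedim!` call folds that dimension into `R` (an atomics-based single-pass variant is left as a TODO in the diff). The padding heuristic above only adds blocks when the independent slices alone would not fill the device: for example, with 8 slices, an occupancy suggestion of 64 blocks, and a 4096-element reduction at 256 threads per block, reduce_blocks = min(cld(4096, 256), cld(64, 8)) = min(16, 8) = 8, for 8*8 = 64 blocks in total. A host-level illustration of the two-pass idea with made-up sizes (plain array operations stand in for the partial and final kernels):

    using CUDA

    A = CUDA.rand(32, 10_000)                  # reduce over dims=2
    partial_blocks = 4                         # pretend 4 blocks cooperate per slice
    partial = CUDA.zeros(Float32, 32, 1, partial_blocks)

    # first pass: each "block" reduces a strided chunk of the slice into its own
    # entry along the trailing dimension (stand-in for partial_mapreduce_grid)
    for b in 1:partial_blocks
        cols = b:partial_blocks:size(A, 2)
        partial[:, :, b] .= sum(view(A, :, cols); dims=2)
    end

    # second pass: fold the trailing dimension of the partials into the output
    # (stand-in for the final GPUArrays.mapreducedim! call)
    R = dropdims(sum(partial; dims=3); dims=3)
    @assert Array(R) ≈ Array(sum(A; dims=2))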