From bc193c3936dcfe98dc1a8d46a949008771939232 Mon Sep 17 00:00:00 2001 From: Vikas Kurapati Date: Wed, 25 Feb 2026 14:21:10 +0100 Subject: [PATCH 1/3] Reduction for CUDA bug fix clang-format modified workgroup sizes --- algorithms/cudahip/Reduction.cpp | 157 ++++++++++++++++++++----------- 1 file changed, 104 insertions(+), 53 deletions(-) diff --git a/algorithms/cudahip/Reduction.cpp b/algorithms/cudahip/Reduction.cpp index 45e8873..4fb2489 100644 --- a/algorithms/cudahip/Reduction.cpp +++ b/algorithms/cudahip/Reduction.cpp @@ -10,6 +10,9 @@ #include namespace device { + +constexpr int WorkGroupSize = 1024; +constexpr int ItemsPerWorkItem = 4; template struct Sum { T defaultValue{0}; @@ -44,70 +47,104 @@ __forceinline__ __device__ T shuffledown(T value, int offset) { } #endif -// a rather "dumb", but general reduction kernel -// (not intended for intensive use; there's the thrust libraries instead) +// Warp reduce operation similar to SYCL +template +__device__ __forceinline__ T warpReduce(T value, OperationT operation) { -template -__launch_bounds__(1024) void __global__ kernel_reduce( - AccT* result, const VecT* vector, size_t size, bool overrideResult, OperationT operation) { - __shared__ AccT shmem[256]; - const auto warpCount = blockDim.x / warpSize; - const auto currentWarp = threadIdx.x / warpSize; - const auto threadInWarp = threadIdx.x % warpSize; - const auto warpsNeeded = (size + warpSize - 1) / warpSize; - - auto value = operation.defaultValue; - auto acc = operation.defaultValue; - -#pragma unroll 4 - for (std::size_t i = currentWarp; i < warpsNeeded; i += warpCount) { - const auto id = threadInWarp + i * warpSize; - const auto valueNew = - (id < size) ? static_cast(ntload(&vector[id])) : operation.defaultValue; - - value = operation(value, valueNew); - } - - for (int offset = 1; offset < warpSize; offset *= 2) { + for (int offset = warpSize / 2; offset > 0; offset /= 2) { value = operation(value, shuffledown(value, offset)); } + return value; +} + +// Helper function for Generic Atomic Update +// Fallback to atomicCAS-based implementation if atomic instruction is not available +// Picked from: https://docs.nvidia.com/cuda/cuda-c-programming-guide/#atomic-functions +template +__device__ __forceinline__ void atomicUpdate(T* address, T val, OperationT operation) { + unsigned long long* address_as_ull = (unsigned long long*)address; + unsigned long long old = *address_as_ull, assumed; + do { + assumed = old; + T calculatedRes = operation(*(T*)&assumed, val); + old = atomicCAS(address_as_ull, assumed, *(unsigned long long*)&calculatedRes); + } while (assumed != old); +} - acc = operation(acc, value); +// Native atomics +template <> +__device__ __forceinline__ void + atomicUpdate>(int* address, int val, device::Sum operation) { + atomicAdd(address, val); +} +template <> +__device__ __forceinline__ void atomicUpdate>( + float* address, float val, device::Sum operation) { + atomicAdd(address, val); +} +#if __CUDA_ARCH__ >= 600 +template <> +__device__ __forceinline__ void atomicUpdate>( + double* address, double val, device::Sum operation) { + atomicAdd(address, val); +} +#endif - if (threadInWarp == 0) { - shmem[currentWarp] = acc; - } +// Block Reduce +template +__device__ __forceinline__ T blockReduce(T val, T* shmem, OperationT operation) { + + const int laneId = threadIdx.x % warpSize; + const int warpId = threadIdx.x / warpSize; + val = warpReduce(val, operation); + if (laneId == 0) + shmem[warpId] = val; __syncthreads(); - if (currentWarp == 0) { - const auto lastWarpsNeeded = (warpCount + warpSize - 1) / warpSize; + const int numWarps = WorkGroupSize / warpSize; + val = (threadIdx.x < numWarps) ? shmem[laneId] : operation.defaultValue; - auto value = operation.defaultValue; - auto lastAcc = operation.defaultValue; + if (warpId == 0) + val = warpReduce(val, operation); -#pragma unroll 2 - for (int i = 0; i < lastWarpsNeeded; ++i) { - const auto id = threadInWarp + i * warpSize; - const auto valueNew = (id < warpCount) ? shmem[id] : operation.defaultValue; + return val; +} - value = operation(value, valueNew); - } +// Init Kernel to handle overrideResult safely across multiple blocks +template +__global__ void initKernel(T* result, OperationT operation) { + if (threadIdx.x == 0) { + *result = operation.defaultValue; + } +} - for (int offset = 1; offset < warpSize; offset *= 2) { - value = operation(value, shuffledown(value, offset)); - } +template +__launch_bounds__(WorkGroupSize) void __global__ kernel_reduce( + AccT* result, const VecT* vector, size_t size, bool overrideResult, OperationT operation) { - lastAcc = operation(lastAcc, value); + // Maximum block size 1024, warp size 32 so 1024/32 = 32 chosen + // For AMD, warp size 64, 1024/64 = 16, but 32 should work with a few idle memory addresses + __shared__ AccT shmem[32]; - if (threadIdx.x == 0) { - if (overrideResult) { - ntstore(result, lastAcc); - } else { - ntstore(result, operation(ntload(result), lastAcc)); - } + AccT threadAcc = operation.defaultValue; + size_t blockBaseIdx = blockIdx.x * (WorkGroupSize * ItemsPerWorkItem); + size_t threadBaseIdx = blockBaseIdx + threadIdx.x; + +#pragma unroll + for (int i = 0; i < ItemsPerWorkItem; i++) { + size_t idx = threadBaseIdx + i * WorkGroupSize; + if (idx < size) { + threadAcc = operation(threadAcc, static_cast(ntload(&vector[idx]))); } } + + AccT blockAcc = blockReduce(threadAcc, shmem, operation); + + if (threadIdx.x == 0) { + (void)overrideResult; // to silence unused parameter warning for non-Add reductions + atomicUpdate(result, blockAcc, operation); + } } template @@ -119,22 +156,36 @@ void Algorithms::reduceVector(AccT* result, void* streamPtr) { auto* stream = reinterpret_cast(streamPtr); - dim3 grid(1, 1, 1); - dim3 block(1024, 1, 1); + size_t totalItems = WorkGroupSize * ItemsPerWorkItem; + size_t numBlocks = (size + totalItems - 1) / totalItems; + + if (overrideResult) { + switch (type) { + case ReductionType::Add: + initKernel<<<1, 1, 0, stream>>>(result, device::Sum()); + break; + case ReductionType::Max: + initKernel<<<1, 1, 0, stream>>>(result, device::Max()); + break; + case ReductionType::Min: + initKernel<<<1, 1, 0, stream>>>(result, device::Min()); + break; + } + } switch (type) { case ReductionType::Add: { - kernel_reduce<<>>( + kernel_reduce<<>>( result, buffer, size, overrideResult, device::Sum()); break; } case ReductionType::Max: { - kernel_reduce<<>>( + kernel_reduce<<>>( result, buffer, size, overrideResult, device::Max()); break; } case ReductionType::Min: { - kernel_reduce<<>>( + kernel_reduce<<>>( result, buffer, size, overrideResult, device::Min()); break; } From 18a86c005f3364a742e775ece27fbcfc706abeb3 Mon Sep 17 00:00:00 2001 From: Vikas Kurapati Date: Mon, 2 Mar 2026 10:04:29 +0100 Subject: [PATCH 2/3] Address review comments: rename to NVIDIA convention, add brackets, make const - Rename WorkGroupSize -> BlockSize and ItemsPerWorkItem -> ItemsPerThread (WorkGroup/WorkItem are AMD/SYCL terminology; block/thread are NVIDIA terms) - Add curly brackets to if (laneId == 0) and if (warpId == 0) bodies in blockReduce - Make totalItems and numBlocks const in reduceVector --- algorithms/cudahip/Reduction.cpp | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/algorithms/cudahip/Reduction.cpp b/algorithms/cudahip/Reduction.cpp index 4fb2489..12f1bf7 100644 --- a/algorithms/cudahip/Reduction.cpp +++ b/algorithms/cudahip/Reduction.cpp @@ -11,8 +11,8 @@ namespace device { -constexpr int WorkGroupSize = 1024; -constexpr int ItemsPerWorkItem = 4; +constexpr int BlockSize = 1024; +constexpr int ItemsPerThread = 4; template struct Sum { T defaultValue{0}; @@ -98,15 +98,17 @@ __device__ __forceinline__ T blockReduce(T val, T* shmem, OperationT operation) const int warpId = threadIdx.x / warpSize; val = warpReduce(val, operation); - if (laneId == 0) + if (laneId == 0) { shmem[warpId] = val; + } __syncthreads(); - const int numWarps = WorkGroupSize / warpSize; + const int numWarps = BlockSize / warpSize; val = (threadIdx.x < numWarps) ? shmem[laneId] : operation.defaultValue; - if (warpId == 0) + if (warpId == 0) { val = warpReduce(val, operation); + } return val; } @@ -120,7 +122,7 @@ __global__ void initKernel(T* result, OperationT operation) { } template -__launch_bounds__(WorkGroupSize) void __global__ kernel_reduce( +__launch_bounds__(BlockSize) void __global__ kernel_reduce( AccT* result, const VecT* vector, size_t size, bool overrideResult, OperationT operation) { // Maximum block size 1024, warp size 32 so 1024/32 = 32 chosen @@ -128,12 +130,12 @@ __launch_bounds__(WorkGroupSize) void __global__ kernel_reduce( __shared__ AccT shmem[32]; AccT threadAcc = operation.defaultValue; - size_t blockBaseIdx = blockIdx.x * (WorkGroupSize * ItemsPerWorkItem); + size_t blockBaseIdx = blockIdx.x * (BlockSize * ItemsPerThread); size_t threadBaseIdx = blockBaseIdx + threadIdx.x; #pragma unroll - for (int i = 0; i < ItemsPerWorkItem; i++) { - size_t idx = threadBaseIdx + i * WorkGroupSize; + for (int i = 0; i < ItemsPerThread; i++) { + size_t idx = threadBaseIdx + i * BlockSize; if (idx < size) { threadAcc = operation(threadAcc, static_cast(ntload(&vector[idx]))); } @@ -156,8 +158,8 @@ void Algorithms::reduceVector(AccT* result, void* streamPtr) { auto* stream = reinterpret_cast(streamPtr); - size_t totalItems = WorkGroupSize * ItemsPerWorkItem; - size_t numBlocks = (size + totalItems - 1) / totalItems; + const size_t totalItems = BlockSize * ItemsPerThread; + const size_t numBlocks = (size + totalItems - 1) / totalItems; if (overrideResult) { switch (type) { @@ -175,17 +177,17 @@ void Algorithms::reduceVector(AccT* result, switch (type) { case ReductionType::Add: { - kernel_reduce<<>>( + kernel_reduce<<>>( result, buffer, size, overrideResult, device::Sum()); break; } case ReductionType::Max: { - kernel_reduce<<>>( + kernel_reduce<<>>( result, buffer, size, overrideResult, device::Max()); break; } case ReductionType::Min: { - kernel_reduce<<>>( + kernel_reduce<<>>( result, buffer, size, overrideResult, device::Min()); break; } From ffc30c423668330085c59c2858819cb408f4e0e5 Mon Sep 17 00:00:00 2001 From: Vikas Kurapati Date: Tue, 3 Mar 2026 11:40:17 +0100 Subject: [PATCH 3/3] vendor warp reduce functions Format and fix compilation fix remove hip for warp reduce functions --- algorithms/cudahip/Reduction.cpp | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/algorithms/cudahip/Reduction.cpp b/algorithms/cudahip/Reduction.cpp index 12f1bf7..1355b1b 100644 --- a/algorithms/cudahip/Reduction.cpp +++ b/algorithms/cudahip/Reduction.cpp @@ -51,6 +51,22 @@ __forceinline__ __device__ T shuffledown(T value, int offset) { template __device__ __forceinline__ T warpReduce(T value, OperationT operation) { +#if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800) + // C++17 compile-time check to ensure we only use this for ints + if constexpr (std::is_same_v || std::is_same_v) { + + unsigned int mask = 0xFFFFFFFF; // 32-bit active thread mask + + if constexpr (std::is_same_v>) { + return __reduce_add_sync(mask, value); + } else if constexpr (std::is_same_v>) { + return __reduce_min_sync(mask, value); + } else if constexpr (std::is_same_v>) { + return __reduce_max_sync(mask, value); + } + } +#endif + for (int offset = warpSize / 2; offset > 0; offset /= 2) { value = operation(value, shuffledown(value, offset)); }