From bc193c3936dcfe98dc1a8d46a949008771939232 Mon Sep 17 00:00:00 2001
From: Vikas Kurapati <vikas.kurapati@tum.de>
Date: Wed, 25 Feb 2026 14:21:10 +0100
Subject: [PATCH 1/3] Reduction for CUDA

bug fix
clang-format
modified workgroup sizes
---
 algorithms/cudahip/Reduction.cpp | 157 ++++++++++++++++++++-----------
 1 file changed, 104 insertions(+), 53 deletions(-)
diff --git a/algorithms/cudahip/Reduction.cpp b/algorithms/cudahip/Reduction.cpp
index 45e8873..4fb2489 100644
--- a/algorithms/cudahip/Reduction.cpp
+++ b/algorithms/cudahip/Reduction.cpp
@@ -10,6 +10,9 @@
 #include <math.h>
 
 namespace device {
+
+constexpr int WorkGroupSize = 1024;
+constexpr int ItemsPerWorkItem = 4;
 template <typename T>
 struct Sum {
   T defaultValue{0};
@@ -44,70 +47,104 @@ __forceinline__ __device__ T shuffledown(T value, int offset) {
 }
 #endif
 
-// a rather "dumb", but general reduction kernel
-// (not intended for intensive use; there's the thrust libraries instead)
+// Warp reduce operation similar to SYCL
+template <typename T, typename OperationT>
+__device__ __forceinline__ T warpReduce(T value, OperationT operation) {
 
-template <typename AccT, typename VecT, typename OperationT>
-__launch_bounds__(1024) void __global__ kernel_reduce(
-    AccT* result, const VecT* vector, size_t size, bool overrideResult, OperationT operation) {
-  __shared__ AccT shmem[256];
-  const auto warpCount = blockDim.x / warpSize;
-  const auto currentWarp = threadIdx.x / warpSize;
-  const auto threadInWarp = threadIdx.x % warpSize;
-  const auto warpsNeeded = (size + warpSize - 1) / warpSize;
-
-  auto value = operation.defaultValue;
-  auto acc = operation.defaultValue;
-
-#pragma unroll 4
-  for (std::size_t i = currentWarp; i < warpsNeeded; i += warpCount) {
-    const auto id = threadInWarp + i * warpSize;
-    const auto valueNew =
-        (id < size) ? static_cast<AccT>(ntload(&vector[id])) : operation.defaultValue;
-
-    value = operation(value, valueNew);
-  }
-
-  for (int offset = 1; offset < warpSize; offset *= 2) {
+  for (int offset = warpSize / 2; offset > 0; offset /= 2) {
     value = operation(value, shuffledown(value, offset));
   }
+  return value;
+}
+
+// Helper function for Generic Atomic Update
+// Fallback to atomicCAS-based implementation if atomic instruction is not available
+// Picked from: https://docs.nvidia.com/cuda/cuda-c-programming-guide/#atomic-functions
+template <typename T, typename OperationT>
+__device__ __forceinline__ void atomicUpdate(T* address, T val, OperationT operation) {
+  unsigned long long* address_as_ull = (unsigned long long*)address;
+  unsigned long long old = *address_as_ull, assumed;
+  do {
+    assumed = old;
+    T calculatedRes = operation(*(T*)&assumed, val);
+    old = atomicCAS(address_as_ull, assumed, *(unsigned long long*)&calculatedRes);
+  } while (assumed != old);
+}
 
-  acc = operation(acc, value);
+// Native atomics
+template <>
+__device__ __forceinline__ void
+    atomicUpdate<int, device::Sum<int>>(int* address, int val, device::Sum<int> operation) {
+  atomicAdd(address, val);
+}
+template <>
+__device__ __forceinline__ void atomicUpdate<float, device::Sum<float>>(
+    float* address, float val, device::Sum<float> operation) {
+  atomicAdd(address, val);
+}
+#if __CUDA_ARCH__ >= 600
+template <>
+__device__ __forceinline__ void atomicUpdate<double, device::Sum<double>>(
+    double* address, double val, device::Sum<double> operation) {
+  atomicAdd(address, val);
+}
+#endif
 
-  if (threadInWarp == 0) {
-    shmem[currentWarp] = acc;
-  }
+// Block Reduce
+template <typename T, typename OperationT>
+__device__ __forceinline__ T blockReduce(T val, T* shmem, OperationT operation) {
+
+  const int laneId = threadIdx.x % warpSize;
+  const int warpId = threadIdx.x / warpSize;
 
+  val = warpReduce(val, operation);
+  if (laneId == 0)
+    shmem[warpId] = val;
   __syncthreads();
 
-  if (currentWarp == 0) {
-    const auto lastWarpsNeeded = (warpCount + warpSize - 1) / warpSize;
+  const int numWarps = WorkGroupSize / warpSize;
+  val = (threadIdx.x < numWarps) ? shmem[laneId] : operation.defaultValue;
 
-    auto value = operation.defaultValue;
-    auto lastAcc = operation.defaultValue;
+  if (warpId == 0)
+    val = warpReduce(val, operation);
 
-#pragma unroll 2
-    for (int i = 0; i < lastWarpsNeeded; ++i) {
-      const auto id = threadInWarp + i * warpSize;
-      const auto valueNew = (id < warpCount) ? shmem[id] : operation.defaultValue;
+  return val;
+}
 
-      value = operation(value, valueNew);
-    }
+// Init Kernel to handle overrideResult safely across multiple blocks
+template <typename T, typename OperationT>
+__global__ void initKernel(T* result, OperationT operation) {
+  if (threadIdx.x == 0) {
+    *result = operation.defaultValue;
+  }
+}
 
-    for (int offset = 1; offset < warpSize; offset *= 2) {
-      value = operation(value, shuffledown(value, offset));
-    }
+template <typename AccT, typename VecT, typename OperationT>
+__launch_bounds__(WorkGroupSize) void __global__ kernel_reduce(
+    AccT* result, const VecT* vector, size_t size, bool overrideResult, OperationT operation) {
 
-    lastAcc = operation(lastAcc, value);
+  // Maximum block size 1024, warp size 32 so 1024/32 = 32 chosen
+  // For AMD, warp size 64, 1024/64 = 16, but 32 should work with a few idle memory addresses
+  __shared__ AccT shmem[32];
 
-    if (threadIdx.x == 0) {
-      if (overrideResult) {
-        ntstore(result, lastAcc);
-      } else {
-        ntstore(result, operation(ntload(result), lastAcc));
-      }
+  AccT threadAcc = operation.defaultValue;
+  size_t blockBaseIdx = blockIdx.x * (WorkGroupSize * ItemsPerWorkItem);
+  size_t threadBaseIdx = blockBaseIdx + threadIdx.x;
+
+#pragma unroll
+  for (int i = 0; i < ItemsPerWorkItem; i++) {
+    size_t idx = threadBaseIdx + i * WorkGroupSize;
+    if (idx < size) {
+      threadAcc = operation(threadAcc, static_cast<AccT>(ntload(&vector[idx])));
     }
   }
+
+  AccT blockAcc = blockReduce<AccT, OperationT>(threadAcc, shmem, operation);
+
+  if (threadIdx.x == 0) {
+    (void)overrideResult; // to silence unused parameter warning for non-Add reductions
+    atomicUpdate(result, blockAcc, operation);
+  }
 }
 
 template <typename AccT, typename VecT>
@@ -119,22 +156,36 @@ void Algorithms::reduceVector(AccT* result,
                               void* streamPtr) {
   auto* stream = reinterpret_cast<internals::DeviceStreamT>(streamPtr);
 
-  dim3 grid(1, 1, 1);
-  dim3 block(1024, 1, 1);
+  size_t totalItems = WorkGroupSize * ItemsPerWorkItem;
+  size_t numBlocks = (size + totalItems - 1) / totalItems;
+
+  if (overrideResult) {
+    switch (type) {
+    case ReductionType::Add:
+      initKernel<<<1, 1, 0, stream>>>(result, device::Sum<AccT>());
+      break;
+    case ReductionType::Max:
+      initKernel<<<1, 1, 0, stream>>>(result, device::Max<AccT>());
+      break;
+    case ReductionType::Min:
+      initKernel<<<1, 1, 0, stream>>>(result, device::Min<AccT>());
+      break;
+    }
+  }
 
   switch (type) {
   case ReductionType::Add: {
-    kernel_reduce<<<grid, block, 0, stream>>>(
+    kernel_reduce<<<numBlocks, WorkGroupSize, 0, stream>>>(
         result, buffer, size, overrideResult, device::Sum<AccT>());
     break;
   }
   case ReductionType::Max: {
-    kernel_reduce<<<grid, block, 0, stream>>>(
+    kernel_reduce<<<numBlocks, WorkGroupSize, 0, stream>>>(
         result, buffer, size, overrideResult, device::Max<AccT>());
     break;
   }
   case ReductionType::Min: {
-    kernel_reduce<<<grid, block, 0, stream>>>(
+    kernel_reduce<<<numBlocks, WorkGroupSize, 0, stream>>>(
         result, buffer, size, overrideResult, device::Min<AccT>());
     break;
   }

From 18a86c005f3364a742e775ece27fbcfc706abeb3 Mon Sep 17 00:00:00 2001
From: Vikas Kurapati <vikas.kurapati@tum.de>
Date: Mon, 2 Mar 2026 10:04:29 +0100
Subject: [PATCH 2/3] Address review comments: rename to NVIDIA convention, add
 brackets, make const

- Rename WorkGroupSize -> BlockSize and ItemsPerWorkItem -> ItemsPerThread
  (WorkGroup/WorkItem are AMD/SYCL terminology; block/thread are NVIDIA terms)
- Add curly brackets to if (laneId == 0) and if (warpId == 0) bodies in blockReduce
- Make totalItems and numBlocks const in reduceVector
---
 algorithms/cudahip/Reduction.cpp | 30 ++++++++++++++++--------------
 1 file changed, 16 insertions(+), 14 deletions(-)

diff --git a/algorithms/cudahip/Reduction.cpp b/algorithms/cudahip/Reduction.cpp
index 4fb2489..12f1bf7 100644
--- a/algorithms/cudahip/Reduction.cpp
+++ b/algorithms/cudahip/Reduction.cpp
@@ -11,8 +11,8 @@
 
 namespace device {
 
-constexpr int WorkGroupSize = 1024;
-constexpr int ItemsPerWorkItem = 4;
+constexpr int BlockSize = 1024;
+constexpr int ItemsPerThread = 4;
 template <typename T>
 struct Sum {
   T defaultValue{0};
@@ -98,15 +98,17 @@ __device__ __forceinline__ T blockReduce(T val, T* shmem, OperationT operation)
   const int warpId = threadIdx.x / warpSize;
 
   val = warpReduce(val, operation);
-  if (laneId == 0)
+  if (laneId == 0) {
     shmem[warpId] = val;
+  }
   __syncthreads();
 
-  const int numWarps = WorkGroupSize / warpSize;
+  const int numWarps = BlockSize / warpSize;
   val = (threadIdx.x < numWarps) ? shmem[laneId] : operation.defaultValue;
 
-  if (warpId == 0)
+  if (warpId == 0) {
     val = warpReduce(val, operation);
+  }
 
   return val;
 }
@@ -120,7 +122,7 @@ __global__ void initKernel(T* result, OperationT operation) {
 }
 
 template <typename AccT, typename VecT, typename OperationT>
-__launch_bounds__(WorkGroupSize) void __global__ kernel_reduce(
+__launch_bounds__(BlockSize) void __global__ kernel_reduce(
     AccT* result, const VecT* vector, size_t size, bool overrideResult, OperationT operation) {
 
   // Maximum block size 1024, warp size 32 so 1024/32 = 32 chosen
@@ -128,12 +130,12 @@ __launch_bounds__(WorkGroupSize) void __global__ kernel_reduce(
   __shared__ AccT shmem[32];
 
   AccT threadAcc = operation.defaultValue;
-  size_t blockBaseIdx = blockIdx.x * (WorkGroupSize * ItemsPerWorkItem);
+  size_t blockBaseIdx = blockIdx.x * (BlockSize * ItemsPerThread);
   size_t threadBaseIdx = blockBaseIdx + threadIdx.x;
 
 #pragma unroll
-  for (int i = 0; i < ItemsPerWorkItem; i++) {
-    size_t idx = threadBaseIdx + i * WorkGroupSize;
+  for (int i = 0; i < ItemsPerThread; i++) {
+    size_t idx = threadBaseIdx + i * BlockSize;
     if (idx < size) {
       threadAcc = operation(threadAcc, static_cast<AccT>(ntload(&vector[idx])));
     }
@@ -156,8 +158,8 @@ void Algorithms::reduceVector(AccT* result,
                               void* streamPtr) {
   auto* stream = reinterpret_cast<internals::DeviceStreamT>(streamPtr);
 
-  size_t totalItems = WorkGroupSize * ItemsPerWorkItem;
-  size_t numBlocks = (size + totalItems - 1) / totalItems;
+  const size_t totalItems = BlockSize * ItemsPerThread;
+  const size_t numBlocks = (size + totalItems - 1) / totalItems;
 
   if (overrideResult) {
     switch (type) {
@@ -175,17 +177,17 @@ void Algorithms::reduceVector(AccT* result,
 
   switch (type) {
   case ReductionType::Add: {
-    kernel_reduce<<<numBlocks, WorkGroupSize, 0, stream>>>(
+    kernel_reduce<<<numBlocks, BlockSize, 0, stream>>>(
         result, buffer, size, overrideResult, device::Sum<AccT>());
     break;
   }
   case ReductionType::Max: {
-    kernel_reduce<<<numBlocks, WorkGroupSize, 0, stream>>>(
+    kernel_reduce<<<numBlocks, BlockSize, 0, stream>>>(
         result, buffer, size, overrideResult, device::Max<AccT>());
     break;
   }
   case ReductionType::Min: {
-    kernel_reduce<<<numBlocks, WorkGroupSize, 0, stream>>>(
+    kernel_reduce<<<numBlocks, BlockSize, 0, stream>>>(
         result, buffer, size, overrideResult, device::Min<AccT>());
     break;
   }

From ffc30c423668330085c59c2858819cb408f4e0e5 Mon Sep 17 00:00:00 2001
From: Vikas Kurapati <vikas.kurapati@tum.de>
Date: Tue, 3 Mar 2026 11:40:17 +0100
Subject: [PATCH 3/3] vendor warp reduce functions

Format and fix
compilation fix
remove hip for warp reduce functions
---
 algorithms/cudahip/Reduction.cpp | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/algorithms/cudahip/Reduction.cpp b/algorithms/cudahip/Reduction.cpp
index 12f1bf7..1355b1b 100644
--- a/algorithms/cudahip/Reduction.cpp
+++ b/algorithms/cudahip/Reduction.cpp
@@ -51,6 +51,22 @@ __forceinline__ __device__ T shuffledown(T value, int offset) {
 template <typename T, typename OperationT>
 __device__ __forceinline__ T warpReduce(T value, OperationT operation) {
 
+#if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800)
+  // C++17 compile-time check to ensure we only use this for ints
+  if constexpr (std::is_same_v<T, int> || std::is_same_v<T, unsigned int>) {
+
+    unsigned int mask = 0xFFFFFFFF; // 32-bit active thread mask
+
+    if constexpr (std::is_same_v<OperationT, device::Sum<T>>) {
+      return __reduce_add_sync(mask, value);
+    } else if constexpr (std::is_same_v<OperationT, device::Min<T>>) {
+      return __reduce_min_sync(mask, value);
+    } else if constexpr (std::is_same_v<OperationT, device::Max<T>>) {
+      return __reduce_max_sync(mask, value);
+    }
+  }
+#endif
+
   for (int offset = warpSize / 2; offset > 0; offset /= 2) {
     value = operation(value, shuffledown(value, offset));
   }