From 976fde3d32c7870ab28f8dd79983aa2925b5c2a2 Mon Sep 17 00:00:00 2001
From: Matthew Leon <matthew.leon.tech@gmail.com>
Date: Mon, 9 Feb 2026 10:08:41 -0800
Subject: [PATCH 1/2] Add support for filling buffers

---
 include/nexus-api/_nxs_functions.h | 12 ++++++++++++
 include/nexus/buffer.h             |  2 ++
 include/nexus/device.h             |  2 ++
 plugins/cuda/cuda_runtime.cpp      | 29 +++++++++++++++++++++++++++++
 scripts/build.sh                   |  2 +-
 src/_buffer_impl.h                 |  2 +-
 src/_device_impl.h                 |  2 ++
 src/buffer.cpp                     | 14 ++++++++++++++
 test/cpp/test_basic_kernel.cpp     |  2 +-
 9 files changed, 64 insertions(+), 3 deletions(-)
diff --git a/include/nexus-api/_nxs_functions.h b/include/nexus-api/_nxs_functions.h
index 5f7762d..5456ec6 100644
--- a/include/nexus-api/_nxs_functions.h
+++ b/include/nexus-api/_nxs_functions.h
@@ -112,6 +112,18 @@ NEXUS_API_FUNC(nxs_status, CopyBuffer,
     void* host_ptr,
     nxs_uint buffer_settings
 )
+
+/************************************************************************
+ * @def FillBuffer
+ * @brief Fill buffer on the device with a value
+ * @return Negative value is an error status.
+ *         NXS_Success when the buffer has been filled.
+***********************************************************************/
+NEXUS_API_FUNC(nxs_status, FillBuffer,
+    nxs_int buffer_id,
+    const void* value
+)
+
 /************************************************************************
  * @def ReleaseBuffer
  * @brief Release the buffer on the device
diff --git a/include/nexus/buffer.h b/include/nexus/buffer.h
index a6139a4..254add5 100644
--- a/include/nexus/buffer.h
+++ b/include/nexus/buffer.h
@@ -34,6 +34,8 @@ class Buffer : public Object<detail::BufferImpl> {
   Buffer getLocal() const;
 
   nxs_status copy(void *_hostBuf, nxs_uint direction = NXS_BufferDeviceToHost);
+  
+  nxs_status fill(float value);
 };
 
 typedef Objects<Buffer> Buffers;
diff --git a/include/nexus/device.h b/include/nexus/device.h
index 9c67268..683585e 100644
--- a/include/nexus/device.h
+++ b/include/nexus/device.h
@@ -49,6 +49,8 @@ class Device : public Object<detail::DeviceImpl> {
   Buffer createBuffer(size_t size, const void *data = nullptr,
                       nxs_uint settings = 0);
   Buffer copyBuffer(Buffer buf, nxs_uint settings = 0);
+  
+  Buffer fillBuffer(const void *value);
 };
 
 typedef Objects<Device> Devices;
diff --git a/plugins/cuda/cuda_runtime.cpp b/plugins/cuda/cuda_runtime.cpp
index 070c43d..8ec5176 100644
--- a/plugins/cuda/cuda_runtime.cpp
+++ b/plugins/cuda/cuda_runtime.cpp
@@ -245,6 +245,35 @@ extern "C" nxs_status NXS_API_CALL nxsCopyBuffer(nxs_int buffer_id,
   return NXS_Success;
 }
 
+// Add the 'const' to match the header
+extern "C" nxs_status NXS_API_CALL nxsFillBuffer(nxs_int buffer_id, const void *fill_value) {
+    // 1. Get the Nexus buffer object
+    auto rt = getRuntime();
+    auto buffer = rt->get<rt::Buffer>(buffer_id);
+    if (!buffer) return NXS_InvalidBuffer;
+
+    // 2. Properly extract the float value
+    // We cast the generic pointer to a float pointer, then dereference.
+    float val = *static_cast<const float*>(fill_value);
+
+    // 3. The "Inefficient" but Reliable Method:
+    // Calculate how many floats we need to fill the allocated space
+    size_t num_elements = buffer->size() / sizeof(float);
+    
+    // Create a temporary host buffer and fill it using the CPU
+    std::vector<float> host_gold_standard(num_elements, val);
+
+    // 4. Blast the filled buffer to the Device
+    // This bypasses the cudaMemset byte-smearing problem entirely.
+    CUDA_CHECK(NXS_InvalidBuffer, cudaMemcpy, 
+               buffer->get(),               // Destination (Device)
+               host_gold_standard.data(),   // Source (Host)
+               buffer->size(), 
+               cudaMemcpyHostToDevice);
+
+    return NXS_Success;
+}
+
 /*
  * Release a buffer on the device.
  */
diff --git a/scripts/build.sh b/scripts/build.sh
index 0361c4a..63df82c 100755
--- a/scripts/build.sh
+++ b/scripts/build.sh
@@ -22,7 +22,7 @@ main() {
   make -j$(nproc)
 
   printf "Running CPU tests"
-  ./test/cpp/test_basic_kernel cpu kernel_libs/cpu_kernel.so add_vectors
+  #./test/cpp/test_basic_kernel cpu kernel_libs/cpu_kernel.so add_vectors
 
   if [[ "$os_type" == "macos" ]]; then
     printf "Running macOS test"
diff --git a/src/_buffer_impl.h b/src/_buffer_impl.h
index 640a1f8..8f79b48 100644
--- a/src/_buffer_impl.h
+++ b/src/_buffer_impl.h
@@ -29,7 +29,7 @@ class BufferImpl : public Impl {
 
   Buffer getLocal();
   nxs_status copyData(void *_hostBuf, nxs_uint direction) const;
-
+  nxs_status fillData(float fillValue) const;
   std::string print() const;
 
  private:
diff --git a/src/_device_impl.h b/src/_device_impl.h
index c472afe..14fd490 100644
--- a/src/_device_impl.h
+++ b/src/_device_impl.h
@@ -52,6 +52,8 @@ class DeviceImpl : public Impl {
   Buffer createBuffer(size_t size, const void *data = nullptr,
                       nxs_uint settings = 0);
   Buffer copyBuffer(Buffer buf, nxs_uint settings = 0);
+  Buffer fillBuffer(float value);
+
 };
 
 }  // namespace detail
diff --git a/src/buffer.cpp b/src/buffer.cpp
index 29427c1..4f662c3 100644
--- a/src/buffer.cpp
+++ b/src/buffer.cpp
@@ -152,6 +152,19 @@ nxs_status detail::BufferImpl::copyData(void *_hostBuf, nxs_uint direction) cons
   return NXS_Success;
 }
 
+nxs_status detail::BufferImpl::fillData(float value) const {
+  std::cout << ">>> FILL BUFFER CALLED! <<<" << std::endl;
+  nxs_status return_stat;
+  if (nxs_valid_id(getDeviceId())) {
+    NEXUS_LOG(NXS_LOG_NOTE, "fillData: on device: ", getSize());
+    auto *rt = getParentOfType<RuntimeImpl>();
+    return_stat = (nxs_status)rt->runAPIFunction<NF_nxsFillBuffer>(getId(), &value);
+  }
+  NEXUS_LOG(NXS_LOG_NOTE, "fillData: on host: ", getSize());
+  memset((void *)getData(), value, getSize());
+  return return_stat;
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 Buffer::Buffer(detail::Impl base, size_t _sz, const void *_hostData)
     : Object(base, _sz, (const char *)_hostData) {}
@@ -177,3 +190,4 @@ Buffer Buffer::getLocal() const {
 }
 
 nxs_status Buffer::copy(void *_hostBuf, nxs_uint direction) { NEXUS_OBJ_MCALL(NXS_InvalidBuffer, copyData, _hostBuf, direction); }
+nxs_status Buffer::fill(float fillValue) { NEXUS_OBJ_MCALL(NXS_InvalidBuffer, fillData, fillValue); }
\ No newline at end of file
diff --git a/test/cpp/test_basic_kernel.cpp b/test/cpp/test_basic_kernel.cpp
index 65fcf02..20012e2 100644
--- a/test/cpp/test_basic_kernel.cpp
+++ b/test/cpp/test_basic_kernel.cpp
@@ -67,7 +67,7 @@ int test_basic_kernel(int argc, char** argv) {
   auto buf0 = dev0.createBuffer(size, vecA.data());
   auto buf1 = dev0.createBuffer(size, vecB.data());
   auto buf2 = dev0.createBuffer(size, vecResult_GPU.data());
-
+  
   auto stream0 = dev0.createStream();
 
   auto sched = dev0.createSchedule();

From f6a57194e8afb84719c8eb0b6a4acb9937757344 Mon Sep 17 00:00:00 2001
From: Matthew Leon <matthew.leon.tech@gmail.com>
Date: Wed, 11 Feb 2026 14:29:46 -0800
Subject: [PATCH 2/2] pr review

---
 include/nexus-api/_nxs_functions.h |  3 +-
 include/nexus/buffer.h             |  2 +-
 include/nexus/device.h             |  2 +-
 plugins/cuda/cuda_runtime.cpp      | 54 +++++++++++++++++-------------
 scripts/build.sh                   | 11 +++---
 src/_buffer_impl.h                 |  2 +-
 src/_device_impl.h                 |  2 +-
 src/buffer.cpp                     |  8 ++---
 test/cpp/test_basic_kernel.cpp     |  2 +-
 test/cpp/test_buffers.cpp          | 54 ++++++++++++++++++++++++++++++
 10 files changed, 101 insertions(+), 39 deletions(-)
 create mode 100644 test/cpp/test_buffers.cpp

diff --git a/include/nexus-api/_nxs_functions.h b/include/nexus-api/_nxs_functions.h
index 5456ec6..3c60586 100644
--- a/include/nexus-api/_nxs_functions.h
+++ b/include/nexus-api/_nxs_functions.h
@@ -121,7 +121,8 @@ NEXUS_API_FUNC(nxs_status, CopyBuffer,
 ***********************************************************************/
 NEXUS_API_FUNC(nxs_status, FillBuffer,
     nxs_int buffer_id,
-    const void* value
+    void *value,
+    size_t size
 )
 
 /************************************************************************
diff --git a/include/nexus/buffer.h b/include/nexus/buffer.h
index 254add5..1fd2980 100644
--- a/include/nexus/buffer.h
+++ b/include/nexus/buffer.h
@@ -35,7 +35,7 @@ class Buffer : public Object<detail::BufferImpl> {
 
   nxs_status copy(void *_hostBuf, nxs_uint direction = NXS_BufferDeviceToHost);
   
-  nxs_status fill(float value);
+  nxs_status fill(void *value, size_t size);
 };
 
 typedef Objects<Buffer> Buffers;
diff --git a/include/nexus/device.h b/include/nexus/device.h
index 683585e..dcf339e 100644
--- a/include/nexus/device.h
+++ b/include/nexus/device.h
@@ -50,7 +50,7 @@ class Device : public Object<detail::DeviceImpl> {
                       nxs_uint settings = 0);
   Buffer copyBuffer(Buffer buf, nxs_uint settings = 0);
   
-  Buffer fillBuffer(const void *value);
+  Buffer fillBuffer(void *value, size_t size);
 };
 
 typedef Objects<Device> Devices;
diff --git a/plugins/cuda/cuda_runtime.cpp b/plugins/cuda/cuda_runtime.cpp
index 8ec5176..0432b4a 100644
--- a/plugins/cuda/cuda_runtime.cpp
+++ b/plugins/cuda/cuda_runtime.cpp
@@ -245,32 +245,41 @@ extern "C" nxs_status NXS_API_CALL nxsCopyBuffer(nxs_int buffer_id,
   return NXS_Success;
 }
 
-// Add the 'const' to match the header
-extern "C" nxs_status NXS_API_CALL nxsFillBuffer(nxs_int buffer_id, const void *fill_value) {
-    // 1. Get the Nexus buffer object
+/*
+ * Fill a buffer on the device with a repeating byte pattern.
+ *
+ * The value_size bytes at value are tiled across the whole buffer; the last
+ * repetition is truncated when the buffer size is not a multiple of value_size.
+ * An unknown buffer, a null value or an empty pattern yields NXS_InvalidBuffer.
+ */
+extern "C" nxs_status NXS_API_CALL nxsFillBuffer(nxs_int buffer_id, void *value, size_t value_size) {
     auto rt = getRuntime();
     auto buffer = rt->get<rt::Buffer>(buffer_id);
-    if (!buffer) return NXS_InvalidBuffer;
-
-    // 2. Properly extract the float value
-    // We cast the generic pointer to a float pointer, then dereference.
-    float val = *static_cast<const float*>(fill_value);
-
-    // 3. The "Inefficient" but Reliable Method:
-    // Calculate how many floats we need to fill the allocated space
-    size_t num_elements = buffer->size() / sizeof(float);
-    
-    // Create a temporary host buffer and fill it using the CPU
-    std::vector<float> host_gold_standard(num_elements, val);
-
-    // 4. Blast the filled buffer to the Device
-    // This bypasses the cudaMemset byte-smearing problem entirely.
-    CUDA_CHECK(NXS_InvalidBuffer, cudaMemcpy, 
-               buffer->get(),               // Destination (Device)
-               host_gold_standard.data(),   // Source (Host)
-               buffer->size(), 
-               cudaMemcpyHostToDevice);
+    if (!buffer || !value || value_size == 0) return NXS_InvalidBuffer;
 
+    const size_t total_size = buffer->size();
+    const auto *pattern = static_cast<const uint8_t *>(value);
+
+    // A pattern made of one repeated byte (all zeros included) can be written
+    // by cudaMemset on the device, so no host staging buffer is needed.
+    const bool is_uniform =
+        std::all_of(pattern, pattern + value_size,
+                    [pattern](uint8_t byte) { return byte == pattern[0]; });
+    if (is_uniform) {
+        CUDA_CHECK(NXS_InvalidBuffer, cudaMemset, buffer->get(), pattern[0],
+                   total_size);
+        return NXS_Success;
+    }
+
+    // Otherwise tile the pattern into a host buffer and upload it in one copy.
+    std::vector<uint8_t> host_buffer(total_size);
+    for (size_t offset = 0; offset < total_size; offset += value_size) {
+        const size_t chunk = std::min(value_size, total_size - offset);
+        std::memcpy(host_buffer.data() + offset, pattern, chunk);
+    }
+
+    CUDA_CHECK(NXS_InvalidBuffer, cudaMemcpy, buffer->get(), host_buffer.data(),
+               total_size, cudaMemcpyHostToDevice);
     return NXS_Success;
 }
 
diff --git a/scripts/build.sh b/scripts/build.sh
index 63df82c..b3e66da 100755
--- a/scripts/build.sh
+++ b/scripts/build.sh
@@ -22,7 +22,7 @@ main() {
   make -j$(nproc)
 
   printf "Running CPU tests"
-  #./test/cpp/test_basic_kernel cpu kernel_libs/cpu_kernel.so add_vectors
+  ./test/cpp/test_basic_kernel cpu kernel_libs/cpu_kernel.so add_vectors
 
   if [[ "$os_type" == "macos" ]]; then
     printf "Running macOS test"
@@ -30,11 +30,12 @@ main() {
 
   elif [[ "$os_type" == "linux" ]]; then
     printf "Running Linux test"
+    ./test/cpp/test_buffers cuda
     ./test/cpp/test_basic_kernel cuda kernel_libs/add_vectors.ptx add_vectors
     ./test/cpp/test_kernel_catalog cuda kernel_libs/add_vectors.kc add_vectors
     ./test/cpp/test_smi cuda kernel_libs/add_vectors.ptx add_vectors
     ./test/cpp/test_multi_stream_sync cuda kernel_libs/add_vectors.ptx add_vectors
     ./test/cpp/test_graph cuda kernel_libs/add_vectors.ptx add_vectors
   else
     printf "Unsupported OS: $os_type"
     exit 1
diff --git a/src/_buffer_impl.h b/src/_buffer_impl.h
index 8f79b48..ac31afa 100644
--- a/src/_buffer_impl.h
+++ b/src/_buffer_impl.h
@@ -29,7 +29,7 @@ class BufferImpl : public Impl {
 
   Buffer getLocal();
   nxs_status copyData(void *_hostBuf, nxs_uint direction) const;
-  nxs_status fillData(float fillValue) const;
+  nxs_status fillData(void *value, size_t size) const;
   std::string print() const;
 
  private:
diff --git a/src/_device_impl.h b/src/_device_impl.h
index 14fd490..27386ef 100644
--- a/src/_device_impl.h
+++ b/src/_device_impl.h
@@ -52,7 +52,7 @@ class DeviceImpl : public Impl {
   Buffer createBuffer(size_t size, const void *data = nullptr,
                       nxs_uint settings = 0);
   Buffer copyBuffer(Buffer buf, nxs_uint settings = 0);
-  Buffer fillBuffer(float value);
+  Buffer fillBuffer(void *value, size_t size);
 
 };
 
diff --git a/src/buffer.cpp b/src/buffer.cpp
index 4f662c3..ec023c8 100644
--- a/src/buffer.cpp
+++ b/src/buffer.cpp
@@ -152,16 +152,18 @@ nxs_status detail::BufferImpl::copyData(void *_hostBuf, nxs_uint direction) cons
   return NXS_Success;
 }
 
-nxs_status detail::BufferImpl::fillData(float value) const {
-  std::cout << ">>> FILL BUFFER CALLED! <<<" << std::endl;
-  nxs_status return_stat;
-  if (nxs_valid_id(getDeviceId())) {
-    NEXUS_LOG(NXS_LOG_NOTE, "fillData: on device: ", getSize());
-    auto *rt = getParentOfType<RuntimeImpl>();
-    return_stat = (nxs_status)rt->runAPIFunction<NF_nxsFillBuffer>(getId(), &value);
-  }
-  NEXUS_LOG(NXS_LOG_NOTE, "fillData: on host: ", getSize());
-  memset((void *)getData(), value, getSize());
-  return return_stat;
+// Fill the buffer by repeating the `size` bytes found at `value`.
+// When the buffer has a valid device id the fill is delegated to the owning
+// runtime's nxsFillBuffer and that call's status is returned as-is.
+// A buffer without a device is left untouched and NXS_InvalidBuffer is
+// returned, so the caller never receives an uninitialized status.
+nxs_status detail::BufferImpl::fillData(void *value, size_t size) const {
+  if (!nxs_valid_id(getDeviceId())) {
+    NEXUS_LOG(NXS_LOG_NOTE, "fillData: no device for buffer: ", getSize());
+    return NXS_InvalidBuffer;
+  }
+  NEXUS_LOG(NXS_LOG_NOTE, "fillData: on device: ", getSize());
+  auto *rt = getParentOfType<RuntimeImpl>();
+  return (nxs_status)rt->runAPIFunction<NF_nxsFillBuffer>(getId(), value, size);
 }
 
@@ -190,4 +192,4 @@ Buffer Buffer::getLocal() const {
 }
 
 nxs_status Buffer::copy(void *_hostBuf, nxs_uint direction) { NEXUS_OBJ_MCALL(NXS_InvalidBuffer, copyData, _hostBuf, direction); }
-nxs_status Buffer::fill(float fillValue) { NEXUS_OBJ_MCALL(NXS_InvalidBuffer, fillData, fillValue); }
\ No newline at end of file
+nxs_status Buffer::fill(void *value, size_t size) { NEXUS_OBJ_MCALL(NXS_InvalidBuffer, fillData, value, size); }
diff --git a/test/cpp/test_basic_kernel.cpp b/test/cpp/test_basic_kernel.cpp
index 20012e2..65fcf02 100644
--- a/test/cpp/test_basic_kernel.cpp
+++ b/test/cpp/test_basic_kernel.cpp
@@ -67,7 +67,7 @@ int test_basic_kernel(int argc, char** argv) {
   auto buf0 = dev0.createBuffer(size, vecA.data());
   auto buf1 = dev0.createBuffer(size, vecB.data());
   auto buf2 = dev0.createBuffer(size, vecResult_GPU.data());
-  
+
   auto stream0 = dev0.createStream();
 
   auto sched = dev0.createSchedule();
diff --git a/test/cpp/test_buffers.cpp b/test/cpp/test_buffers.cpp
new file mode 100644
index 0000000..1d0d6b3
--- /dev/null
+++ b/test/cpp/test_buffers.cpp
@@ -0,0 +1,63 @@
+#include <gtest/gtest.h>
+#include <nexus.h>
+
+#include <cstddef>
+#include <cstdint>
+#include <string>
+#include <vector>
+
+#define SUCCESS 0
+#define FAILURE 1
+
+// Command line left over after InitGoogleTest has removed the flags it
+// recognises; populated by main() before any test runs.
+int g_argc = 0;
+char** g_argv = nullptr;
+
+// Fills a 1024-byte buffer on device 0 of `runtime_name` with a 6-byte pattern
+// and checks every byte read back. 1024 is not a multiple of 6, so the
+// truncated last repetition of the pattern is checked as well.
+// Returns SUCCESS, or FAILURE when the runtime or device is unavailable, a
+// buffer call does not report NXS_Success, or a byte differs from the pattern.
+int test_direct_buffer_fill(const std::string& runtime_name) {
+    auto sys = nexus::getSystem();
+    auto runtime = sys.getRuntime(runtime_name);
+    if (!runtime || runtime.getDevices().empty()) return FAILURE;
+
+    auto dev = runtime.getDevice(0);
+
+    std::vector<uint8_t> pattern = {0xDE, 0xAD, 0xBE, 0xEF, 0xCA, 0xFE};
+    const size_t buffer_size = 1024;
+
+    auto buf = dev.createBuffer(buffer_size, nullptr);
+    if (buf.fill(pattern.data(), pattern.size()) != NXS_Success) return FAILURE;
+
+    std::vector<uint8_t> host_out(buffer_size);
+    if (buf.copy(host_out.data(), NXS_BufferDeviceToHost) != NXS_Success) {
+        return FAILURE;
+    }
+
+    for (size_t i = 0; i < buffer_size; ++i) {
+        if (host_out[i] != pattern[i % pattern.size()]) return FAILURE;
+    }
+
+    return SUCCESS;
+}
+
+class BufferTest : public ::testing::Test {};
+
+TEST_F(BufferTest, DIRECT_FILL) {
+    // argv[1], when present, selects the runtime; default to CUDA.
+    const std::string runtime_name =
+        (g_argc > 1 && g_argv[1] != nullptr) ? g_argv[1] : "cuda";
+    EXPECT_EQ(test_direct_buffer_fill(runtime_name), SUCCESS);
+}
+
+int main(int argc, char** argv) {
+    // InitGoogleTest rewrites argc/argv in place, so record them afterwards;
+    // otherwise g_argc could count gtest flags that are no longer in argv.
+    ::testing::InitGoogleTest(&argc, argv);
+    g_argc = argc;
+    g_argv = argv;
+    return RUN_ALL_TESTS();
+}