Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions include/nexus-api/_nxs_functions.h
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,24 @@ NEXUS_API_FUNC(nxs_status, CopyBuffer,
void* host_ptr,
nxs_uint buffer_settings
)

/************************************************************************
 * @def ReshapeBuffer
 * @brief Record a new shape for an existing buffer
 * @param buffer_id id of the buffer to reshape
 * @param new_shape array holding ndims dimension extents
 * @param ndims     number of entries in new_shape
 * @return Negative value is an error status.
 ***********************************************************************/
NEXUS_API_FUNC(nxs_status, ReshapeBuffer,
nxs_int buffer_id,
int *new_shape,
int ndims
)

/************************************************************************
 * @def FillBuffer
 * @brief Fill buffer on the device with a value
 * @param buffer_id id of the buffer to fill
 * @param value     pointer to the fill value (the CUDA plugin reads it
 *                  as a single float)
 * @return Negative value is an error status.
 *         The CUDA plugin returns NXS_Success once the fill is done.
 ***********************************************************************/
NEXUS_API_FUNC(nxs_status, FillBuffer,
nxs_int buffer_id,
const void* value
)

/************************************************************************
* @def ReleaseBuffer
* @brief Release the buffer on the device
Expand Down
11 changes: 11 additions & 0 deletions include/nexus-api/nxs.h
Original file line number Diff line number Diff line change
Expand Up @@ -252,6 +252,17 @@ enum _nxs_buffer_settings {
};
typedef enum _nxs_buffer_settings nxs_buffer_settings;

/* ENUM _nxs_buffer_transfer */
/*
 * Direction of a copy between a buffer and a host pointer.
 *
 * NXS_BufferDeviceToHost:
 *   - Copy buffer from device to host
 * NXS_BufferHostToDevice:
 *   - Copy buffer from host to device
 */
enum _nxs_buffer_transfer {
  NXS_BufferDeviceToHost = 0,
  NXS_BufferHostToDevice = 1,
};
/* Alias without the leading underscore, as for the other enums in this header. */
typedef enum _nxs_buffer_transfer nxs_buffer_transfer;

/********************************************************************************************************/
/* Constants */
Expand Down
7 changes: 6 additions & 1 deletion include/nexus/buffer.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ class Buffer : public Object<detail::BufferImpl> {
Buffer(detail::Impl base, size_t _sz, const void *_hostData = nullptr);
Buffer(detail::Impl base, nxs_int devId, size_t _sz,
const void *_deviceData = nullptr);
Buffer(detail::Impl base, nxs_int devId, std::vector<nxs_int> shape,
const void *_deviceData = nullptr);
using Object::Object;

nxs_int getDeviceId() const;
Expand All @@ -33,7 +35,10 @@ class Buffer : public Object<detail::BufferImpl> {

Buffer getLocal() const;

nxs_status copy(void *_hostBuf);
nxs_status copy(void *_hostBuf, nxs_uint direction = NXS_BufferDeviceToHost);

nxs_status reshape(std::vector<nxs_int> new_shape);
nxs_status fill(float value);
};

typedef Objects<Buffer> Buffers;
Expand Down
4 changes: 4 additions & 0 deletions include/nexus/device.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,11 @@ class Device : public Object<detail::DeviceImpl> {

Buffer createBuffer(size_t size, const void *data = nullptr,
nxs_uint settings = 0);
Buffer createBuffer(std::vector<nxs_int> shape, const void *data = nullptr,
nxs_uint settings = 0);
Buffer copyBuffer(Buffer buf, nxs_uint settings = 0);
// Reshape `buf` to `new_shape`.
// NOTE(review): no Device::reshapeBuffer definition is visible in the
// device.cpp changes (only DeviceImpl::reshapeBuffer) — confirm it exists.
Buffer reshapeBuffer(Buffer buf, std::vector<nxs_int> new_shape);
// NOTE(review): DeviceImpl declares fillBuffer(float value) while this takes
// `const void *`, and no definition of either is visible — confirm the
// intended signature before callers depend on it.
Buffer fillBuffer(const void *value);
};

typedef Objects<Device> Devices;
Expand Down
48 changes: 46 additions & 2 deletions plugins/cuda/cuda_runtime.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -236,12 +236,56 @@ extern "C" nxs_status NXS_API_CALL nxsCopyBuffer(nxs_int buffer_id,
auto buffer = rt->get<rt::Buffer>(buffer_id);
if (!buffer) return NXS_InvalidBuffer;
if (!host_ptr) return NXS_InvalidHostPtr;
if (copy_settings == NXS_BufferDeviceToHost)
CUDA_CHECK(NXS_InvalidBuffer, cudaMemcpy, host_ptr, buffer->get(),
buffer->size(), cudaMemcpyDeviceToHost);
else
CUDA_CHECK(NXS_InvalidBuffer, cudaMemcpy, buffer->get(), host_ptr,
buffer->size(), cudaMemcpyHostToDevice);
return NXS_Success;
}

/*
 * Record a new shape for a buffer.
 *
 * Only the shape stored on the runtime buffer changes; no device memory is
 * read, written, or reallocated.
 *
 * buffer_id: id of the buffer to reshape
 * new_shape: array of `ndims` dimension extents
 * ndims:     number of entries in `new_shape`
 *
 * Returns NXS_InvalidBuffer for an unknown buffer id, NXS_InvalidHostPtr for
 * a missing or negative-length dimension list, NXS_Success otherwise.
 */
extern "C" nxs_status NXS_API_CALL nxsReshapeBuffer(nxs_int buffer_id, int *new_shape, int ndims) {
  auto rt = getRuntime();

  auto buffer = rt->get<rt::Buffer>(buffer_id);
  if (!buffer) return NXS_InvalidBuffer;

  // Reject input that would make the iterator range below invalid.
  if (ndims < 0 || (ndims > 0 && !new_shape)) return NXS_InvalidHostPtr;

  // The temporary is moved into setShape's by-value parameter.
  buffer->setShape(std::vector<int>(new_shape, new_shape + ndims));
  return NXS_Success;
}

/*
 * Fill a buffer on the device with a float value.
 *
 * buffer_id:  id of the buffer to fill
 * fill_value: pointer to a single float holding the fill value
 *
 * The buffer is treated as an array of floats; any trailing bytes beyond the
 * last whole float are left untouched.
 *
 * Returns NXS_InvalidBuffer for an unknown buffer id or a failed copy,
 * NXS_InvalidHostPtr when `fill_value` is null, NXS_Success otherwise.
 */
extern "C" nxs_status NXS_API_CALL nxsFillBuffer(nxs_int buffer_id, const void *fill_value) {
  auto rt = getRuntime();
  auto buffer = rt->get<rt::Buffer>(buffer_id);
  if (!buffer) return NXS_InvalidBuffer;
  if (!fill_value) return NXS_InvalidHostPtr;

  // The generic pointer carries one float.
  const float val = *static_cast<const float *>(fill_value);

  // Number of whole floats that fit in the allocation.
  const size_t num_elements = buffer->size() / sizeof(float);
  if (num_elements == 0) return NXS_Success;

  // Stage the pattern on the host and upload it. cudaMemset is not used
  // because it replicates a single byte, which cannot represent an arbitrary
  // float.
  std::vector<float> host_fill(num_elements, val);

  // Copy exactly what was staged: buffer->size() may exceed
  // num_elements * sizeof(float) when it is not a multiple of 4, and using it
  // would read past the end of host_fill.
  CUDA_CHECK(NXS_InvalidBuffer, cudaMemcpy,
             buffer->get(),       // Destination (Device)
             host_fill.data(),    // Source (Host)
             num_elements * sizeof(float),
             cudaMemcpyHostToDevice);

  return NXS_Success;
}

/*
* Release a buffer on the device.
*/
Expand Down
3 changes: 2 additions & 1 deletion plugins/include/rt_buffer.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ class Buffer {
char *buf;
size_t sz;
nxs_uint settings;

std::vector<int> shape;
public:
Buffer(size_t size = 0, void *data_ptr = nullptr, nxs_uint settings = 0)
: buf((char *)data_ptr), sz(size), settings(settings) {
Expand Down Expand Up @@ -51,6 +51,7 @@ class Buffer {
T *get() {
return reinterpret_cast<T *>(buf);
}
// Replace the stored shape. The argument is already a private copy, so swap
// it into the member instead of copying it a second time.
void setShape(std::vector<int> new_shape) { shape.swap(new_shape); }
};

} // namespace rt
Expand Down
3 changes: 2 additions & 1 deletion scripts/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -22,14 +22,15 @@ main() {
make -j$(nproc)

printf "Running CPU tests"
./test/cpp/test_basic_kernel cpu kernel_libs/cpu_kernel.so add_vectors
#./test/cpp/test_basic_kernel cpu kernel_libs/cpu_kernel.so add_vectors

if [[ "$os_type" == "macos" ]]; then
printf "Running macOS test"
#./test/cpp/gpu/nexus_gpu_integration_test metal metal_kernels/kernel.metallib add_vectors

elif [[ "$os_type" == "linux" ]]; then
printf "Running Linux test"
./test/cpp/test_buffers cuda
./test/cpp/test_basic_kernel cuda kernel_libs/add_vectors.ptx add_vectors
./test/cpp/test_kernel_catalog cuda kernel_libs/add_vectors.kc add_vectors
./test/cpp/test_smi cuda kernel_libs/add_vectors.ptx add_vectors
Expand Down
7 changes: 5 additions & 2 deletions src/_buffer_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ class BufferImpl : public Impl {
public:
BufferImpl(Impl base, size_t _sz, const char *_hostData);
BufferImpl(Impl base, nxs_int _devId, size_t _sz, const char *_hostData);
BufferImpl(Impl base, nxs_int _devId, std::vector<nxs_int> shape, const char *_hostData);

~BufferImpl();

Expand All @@ -28,8 +29,9 @@ class BufferImpl : public Impl {
void setData(void *_data) { data = _data; }

Buffer getLocal();
nxs_status copyData(void *_hostBuf) const;

nxs_status copyData(void *_hostBuf, nxs_uint direction) const;
nxs_status reshape(std::vector<nxs_int> new_shape);
nxs_status fillData(float fillValue) const;
std::string print() const;

private:
Expand All @@ -40,6 +42,7 @@ class BufferImpl : public Impl {
// set of runtimes
nxs_int deviceId;
size_t size;
std::vector<nxs_int> shape;
void *data;
};
} // namespace detail
Expand Down
5 changes: 5 additions & 0 deletions src/_device_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,12 @@ class DeviceImpl : public Impl {

Buffer createBuffer(size_t size, const void *data = nullptr,
nxs_uint settings = 0);
Buffer createBuffer(std::vector<nxs_int> shape, const void *data = nullptr,
nxs_uint settings = 0);
Buffer copyBuffer(Buffer buf, nxs_uint settings = 0);
Buffer reshapeBuffer(Buffer buf, std::vector<nxs_int> new_shape);
Buffer fillBuffer(float value);

};

} // namespace detail
Expand Down
3 changes: 3 additions & 0 deletions src/_system_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@ class SystemImpl : public detail::Impl {
}
Buffer createBuffer(size_t sz, const void *hostData = nullptr,
nxs_uint options = 0);
Buffer createBuffer(std::vector<nxs_int> shape, const void *hostData = nullptr,
nxs_uint options = 0);

Buffer copyBuffer(Buffer buf, Device dev, nxs_uint options = 0);
Info loadCatalog(const std::string &catalogPath);

Expand Down
39 changes: 36 additions & 3 deletions src/buffer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include <nexus/system.h>

#include <cstring>
#include <numeric>

#include "_buffer_impl.h"
#include "_runtime_impl.h"
Expand All @@ -25,6 +26,14 @@ detail::BufferImpl::BufferImpl(detail::Impl base, nxs_int _devId, size_t _sz,
setData(_sz, _hostData);
}

/*
 * Construct a buffer bound to device `_devId` from a shape.
 *
 * The byte size handed to setData is the product of the dimensions times
 * sizeof(float), the same size the Buffer wrapper computes for shaped buffers.
 */
detail::BufferImpl::BufferImpl(detail::Impl base, nxs_int _devId, std::vector<nxs_int> shape,
                               const char *_hostData)
    : Impl(base), deviceId(_devId), size(0), shape(std::move(shape)), data(nullptr) {
  // Inside the body `shape` names the parameter, which was just moved from.
  // Read the dimensions from the member instead.
  size_t totalSize = sizeof(float);
  for (auto dim : this->shape) totalSize *= static_cast<size_t>(dim);
  setData(totalSize, _hostData);
}

detail::BufferImpl::~BufferImpl() { release(); }

void detail::BufferImpl::release() {
Expand Down Expand Up @@ -140,25 +149,47 @@ Buffer detail::BufferImpl::getLocal() {
return Buffer();
}

/*
 * Copy getSize() bytes between this buffer and a caller-provided host buffer.
 *
 * _hostBuf:  host memory of at least getSize() bytes
 * direction: NXS_BufferDeviceToHost copies the buffer into _hostBuf,
 *            NXS_BufferHostToDevice copies _hostBuf into the buffer
 *
 * Buffers with a valid device id are handled by the runtime's nxsCopyBuffer;
 * otherwise the copy is a plain memcpy on the host-side storage.
 */
nxs_status detail::BufferImpl::copyData(void *_hostBuf, nxs_uint direction) const {
  if (nxs_valid_id(getDeviceId())) {
    NEXUS_LOG(NXS_LOG_NOTE, "copyData: from device: ", getSize());
    auto *rt = getParentOfType<RuntimeImpl>();
    return (nxs_status)rt->runAPIFunction<NF_nxsCopyBuffer>(getId(), _hostBuf,
                                                            direction);
  }
  NEXUS_LOG(NXS_LOG_NOTE, "copyData: from host: ", getSize());
  if (!_hostBuf) return NXS_InvalidHostPtr;
  // Honour the requested direction on the host path too.
  if (direction == NXS_BufferHostToDevice)
    memcpy((void *)getData(), _hostBuf, getSize());
  else
    memcpy(_hostBuf, getData(), getSize());
  return NXS_Success;
}

/*
 * Change the shape recorded for this buffer.
 *
 * For a buffer with a valid device id the runtime is asked to reshape its
 * copy first; the local shape is only updated when that call succeeds, so
 * the two stay in agreement. Buffers without a device only update the local
 * shape.
 */
nxs_status detail::BufferImpl::reshape(std::vector<nxs_int> new_shape) {
  if (nxs_valid_id(getDeviceId())) {
    auto *rt = getParentOfType<RuntimeImpl>();
    // The API takes the dimension count as an int.
    auto status = (nxs_status)rt->runAPIFunction<NF_nxsReshapeBuffer>(
        getId(), new_shape.data(), static_cast<int>(new_shape.size()));
    if (status != NXS_Success) return status;
  }
  shape = std::move(new_shape);
  return NXS_Success;
}

/*
 * Fill the buffer with `value`, treating its contents as an array of floats.
 *
 * Buffers with a valid device id are filled by the runtime's nxsFillBuffer
 * and its status is returned. Otherwise the host-side storage is filled
 * directly; trailing bytes beyond the last whole float are left untouched.
 *
 * Returns NXS_InvalidBuffer when there is no host storage to fill.
 */
nxs_status detail::BufferImpl::fillData(float value) const {
  if (nxs_valid_id(getDeviceId())) {
    NEXUS_LOG(NXS_LOG_NOTE, "fillData: on device: ", getSize());
    auto *rt = getParentOfType<RuntimeImpl>();
    // Return here: the device fill is the whole job for device buffers.
    return (nxs_status)rt->runAPIFunction<NF_nxsFillBuffer>(getId(), &value);
  }
  NEXUS_LOG(NXS_LOG_NOTE, "fillData: on host: ", getSize());
  char *dst = (char *)getData();
  if (!dst) return NXS_InvalidBuffer;
  // memset would truncate the float to an int and replicate one byte, so
  // write the float's bytes element by element. memcpy avoids any alignment
  // assumption about the storage.
  const size_t count = getSize() / sizeof(float);
  for (size_t i = 0; i < count; ++i)
    memcpy(dst + i * sizeof(float), &value, sizeof(float));
  return NXS_Success;
}

///////////////////////////////////////////////////////////////////////////////
Buffer::Buffer(detail::Impl base, size_t _sz, const void *_hostData)
: Object(base, _sz, (const char *)_hostData) {}

Buffer::Buffer(detail::Impl base, nxs_int _devId, size_t _sz, const void *_hostData)
: Object(base, _devId, _sz, (const char *)_hostData) {}

Buffer::Buffer(detail::Impl base, nxs_int _devId, std::vector<nxs_int> shape,
const void *_hostData) : Object(base, _devId,
std::accumulate(shape.begin(), shape.end(), (size_t)1, std::multiplies<size_t>()) * sizeof(float),
(const char *)_hostData) { NEXUS_OBJ_MCALL_VOID(reshape, shape); }

nxs_int Buffer::getDeviceId() const { NEXUS_OBJ_MCALL(NXS_InvalidBuffer, getDeviceId); }

std::optional<Property> Buffer::getProperty(nxs_int prop) const {
Expand All @@ -176,4 +207,6 @@ Buffer Buffer::getLocal() const {
return get()->getLocal();
}

nxs_status Buffer::copy(void *_hostBuf) { NEXUS_OBJ_MCALL(NXS_InvalidBuffer, copyData, _hostBuf); }
// Copy between this buffer and `_hostBuf` in the given direction by
// forwarding to BufferImpl::copyData.
// NOTE(review): NXS_InvalidBuffer is presumably what NEXUS_OBJ_MCALL yields
// when the object has no impl — confirm against the macro.
nxs_status Buffer::copy(void *_hostBuf, nxs_uint direction) { NEXUS_OBJ_MCALL(NXS_InvalidBuffer, copyData, _hostBuf, direction); }
// Reshape this buffer by forwarding `new_shape` to BufferImpl::reshape.
nxs_status Buffer::reshape(std::vector<nxs_int> new_shape) {NEXUS_OBJ_MCALL(NXS_InvalidBuffer, reshape, new_shape);}
// Fill this buffer with `fillValue` by forwarding to BufferImpl::fillData.
nxs_status Buffer::fill(float fillValue) { NEXUS_OBJ_MCALL(NXS_InvalidBuffer, fillData, fillValue); }
21 changes: 21 additions & 0 deletions src/device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,17 @@ Buffer detail::DeviceImpl::createBuffer(size_t size, const void *data,
return nbuf;
}

/*
 * Create a buffer on this device from a shape.
 *
 * The allocation size in bytes is the product of the dimensions times
 * sizeof(float), the same size the shaped Buffer constructor records, so the
 * runtime allocation and the Buffer object agree.
 *
 * Returns an empty Buffer when a dimension is negative.
 */
Buffer detail::DeviceImpl::createBuffer(std::vector<nxs_int> shape, const void *data,
                                        nxs_uint settings) {
  NEXUS_LOG(NXS_LOG_NOTE, " createBuffer with shape");
  size_t totalSize = sizeof(float);
  for (auto dim : shape) {
    // A negative extent would wrap to a huge size_t.
    if (dim < 0) return Buffer();
    totalSize *= static_cast<size_t>(dim);
  }
  APICALL(nxsCreateBuffer, getId(), totalSize, (void *)data, settings);
  Buffer nbuf(Impl(this, apiResult, settings), getId(), shape, data);
  buffers.add(nbuf);
  return nbuf;
}

Buffer detail::DeviceImpl::copyBuffer(Buffer buf, nxs_uint settings) {
NEXUS_LOG(NXS_LOG_NOTE, " copyBuffer");
settings |= buf.getSettings() & ~NXS_BufferSettings_OnDevice;
Expand All @@ -174,6 +185,12 @@ Buffer detail::DeviceImpl::copyBuffer(Buffer buf, nxs_uint settings) {
return nbuf;
}

// Ask the runtime to reshape `buf` to `new_shape` and hand the same Buffer
// back to the caller.
// NOTE(review): `buf` is returned regardless of anything visible here; whether
// APICALL bails out on failure depends on the macro — confirm.
Buffer detail::DeviceImpl::reshapeBuffer(Buffer buf, std::vector<nxs_int> new_shape) {
NEXUS_LOG(NXS_LOG_NOTE, " reshapeBuffer");
// new_shape.size() is a size_t while nxsReshapeBuffer declares `int ndims`.
APICALL(nxsReshapeBuffer, buf.getId(), new_shape.data(), new_shape.size());
return buf;
}

///////////////////////////////////////////////////////////////////////////////
/// Object wrapper - Device
///////////////////////////////////////////////////////////////////////////////
Expand Down Expand Up @@ -213,6 +230,10 @@ Buffer Device::createBuffer(size_t size, const void *data, nxs_uint settings) {
NEXUS_OBJ_MCALL(Buffer(), createBuffer, size, data, settings);
}

// Shaped overload of createBuffer: forwards to DeviceImpl::createBuffer.
// NOTE(review): Buffer() is presumably the result when this Device has no
// impl — confirm against NEXUS_OBJ_MCALL.
Buffer Device::createBuffer(std::vector<nxs_int> shape, const void *data, nxs_uint settings) {
NEXUS_OBJ_MCALL(Buffer(), createBuffer, shape, data, settings);
}

Buffer Device::copyBuffer(Buffer buf, nxs_uint settings) {
NEXUS_OBJ_MCALL(Buffer(), copyBuffer, buf, settings);
}
Expand Down
4 changes: 2 additions & 2 deletions test/cpp/test_basic_kernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ int test_basic_kernel(int argc, char** argv) {
auto buf0 = dev0.createBuffer(size, vecA.data());
auto buf1 = dev0.createBuffer(size, vecB.data());
auto buf2 = dev0.createBuffer(size, vecResult_GPU.data());

auto stream0 = dev0.createStream();

auto sched = dev0.createSchedule();
Expand All @@ -84,7 +84,7 @@ int test_basic_kernel(int argc, char** argv) {
auto time_ms = sched.getProp<nxs_double>(NP_ElapsedTime);
std::cout << "Elapsed time: " << time_ms << std::endl;

buf2.copy(vecResult_GPU.data());
buf2.copy(vecResult_GPU.data(), NXS_BufferDeviceToHost);

int i = 0;
for (auto v : vecResult_GPU) {
Expand Down
Loading