diff --git a/.gitignore b/.gitignore index 5347b19..d9c8c49 100644 --- a/.gitignore +++ b/.gitignore @@ -17,7 +17,7 @@ # Compiled Static libraries *.a *.lib - +__dummy_docs # Executables *.exe diff --git a/docs/01-installation.md b/docs/01-installation.md index 2ba6a42..f461ae7 100644 --- a/docs/01-installation.md +++ b/docs/01-installation.md @@ -24,8 +24,33 @@ If you are not using dub DCompute has a few of dependencies that you need to include: * [derelict-cl](https://github.com/DerelictOrg/DerelictCL) for OpenCL bindings -* [derelict-cuda](https://github.com/DerelictOrg/DerelictCUDA) for CUDA bindings -* [derelict-util](https://github.com/DerelictOrg/DerelictUtil) shared library loading utilities used by the above +* [bindbc-cuda](https://github.com/BindBC/bindbc-cuda) for CUDA bindings +* [derelict-util](https://github.com/DerelictOrg/DerelictUtil) shared library loading utilities used by derelict-cl + +Configuring bindbc-cuda +----------------------- + +Unlike the previous Derelict bindings, `bindbc-cuda` requires you to specify which +CUDA Driver API version to target via a D version flag in your `dub.json`. +This controls which host-side CUDA functions (e.g. `cuMemPrefetchAsync`) are available. + +Add the appropriate version to your `dub.json` configuration: + +```json +"versions": ["CUDA_120"] +``` + +Supported version flags: `CUDA_100`, `CUDA_101`, `CUDA_102`, `CUDA_110`, `CUDA_111`, +`CUDA_112`, `CUDA_118`, `CUDA_120`, `CUDA_122`, `CUDA_124`, `CUDA_130`, `CUDA_132`. + +If no version flag is specified, `bindbc-cuda` defaults to `CUDA_100` (CUDA 10.0). +Choose the version that matches the CUDA toolkit installed on your system — you can +check yours by running `nvcc --version` (or, if only the NVIDIA driver is installed, use the "CUDA Version" reported by `nvidia-smi`). + +**Note:** This version flag is independent of the LDC `-mdcompute-targets` flag. +The `dflags` target (e.g. 
`cuda-210`) controls which GPU hardware architecture +LDC generates PTX code for, while the `versions` flag controls which driver API +functions are available on the host side. Drivers ------- diff --git a/dub.json b/dub.json index 25af6b3..3154710 100644 --- a/dub.json +++ b/dub.json @@ -6,7 +6,7 @@ "license": "BSL-1.0", "dependencies": { "derelict-cl" : "~>3.2.0", - "derelict-cuda": "~>3.1.1", + "bindbc-cuda": "~>0.1.0", "taggedalgebraic": "~>0.10.7" }, "configurations": [ diff --git a/source/dcompute/driver/cuda/context.d b/source/dcompute/driver/cuda/context.d index fca500b..f879885 100644 --- a/source/dcompute/driver/cuda/context.d +++ b/source/dcompute/driver/cuda/context.d @@ -4,7 +4,7 @@ import dcompute.driver.cuda; struct Context { - void* raw; + CUcontext raw; this(Device dev, uint flags = 0) { status = cast(Status)cuCtxCreate(&raw, flags,dev.raw); @@ -62,7 +62,7 @@ struct Context static @property size_t limit(Limit what)() { size_t ret; - status = cast(Status)cuCtxSetLimit(&ret,what); + status = cast(Status)cuCtxGetLimit(&ret,what); checkErrors(); return ret; } diff --git a/source/dcompute/driver/cuda/device.d b/source/dcompute/driver/cuda/device.d index 46dd563..9483435 100644 --- a/source/dcompute/driver/cuda/device.d +++ b/source/dcompute/driver/cuda/device.d @@ -115,6 +115,7 @@ struct Device ` () { int result; `, ` status = cast(Status)cuDeviceGetAttribute( `, ` &result, `, + ` cast(CUdevice_attribute) `, __traits(getAttributes, __traits(getMember, Info, mem))[0].stringof, `, raw); `, ` checkErrors(); `, diff --git a/source/dcompute/driver/cuda/event.d b/source/dcompute/driver/cuda/event.d index d581d2c..3cc1367 100644 --- a/source/dcompute/driver/cuda/event.d +++ b/source/dcompute/driver/cuda/event.d @@ -4,6 +4,6 @@ import dcompute.driver.cuda; struct Event { - void* raw; + CUevent raw; } diff --git a/source/dcompute/driver/cuda/kernel.d b/source/dcompute/driver/cuda/kernel.d index da403d6..8b07b8b 100644 --- a/source/dcompute/driver/cuda/kernel.d 
+++ b/source/dcompute/driver/cuda/kernel.d @@ -3,7 +3,7 @@ module dcompute.driver.cuda.kernel; import dcompute.driver.cuda; struct Kernel(F) if (is(F==function)|| is(F==void)) { - void* raw; + CUfunction raw; static struct Attributes { diff --git a/source/dcompute/driver/cuda/package.d b/source/dcompute/driver/cuda/package.d index 7dd2dfe..6c9ac93 100644 --- a/source/dcompute/driver/cuda/package.d +++ b/source/dcompute/driver/cuda/package.d @@ -1,7 +1,7 @@ module dcompute.driver.cuda; public import ldc.dcompute; -public import derelict.cuda.driverapi; +public import bindbc.cuda; public import dcompute.driver.error; diff --git a/source/dcompute/driver/cuda/platform.d b/source/dcompute/driver/cuda/platform.d index 17e12a7..a9bba17 100644 --- a/source/dcompute/driver/cuda/platform.d +++ b/source/dcompute/driver/cuda/platform.d @@ -8,7 +8,12 @@ struct Platform { static void initialise(uint flags =0) { - DerelictCUDADriver.load(); + auto support = loadCUDA(); + if (support == CUDASupport.noLibrary || support == CUDASupport.badLibrary) + { + status = Status.sharedObjectInitFailed; + checkErrors(); + } status = cast(Status)cuInit(flags); checkErrors(); } diff --git a/source/dcompute/driver/cuda/program.d b/source/dcompute/driver/cuda/program.d index 25191ac..86d9ca0 100644 --- a/source/dcompute/driver/cuda/program.d +++ b/source/dcompute/driver/cuda/program.d @@ -5,7 +5,7 @@ import dcompute.driver.cuda; import std.string; struct Program { - void* raw; + CUmodule raw; Kernel!void getKernelByName(immutable(char)* name) { diff --git a/source/dcompute/driver/cuda/queue.d b/source/dcompute/driver/cuda/queue.d index 774aedf..09e9f6b 100644 --- a/source/dcompute/driver/cuda/queue.d +++ b/source/dcompute/driver/cuda/queue.d @@ -4,7 +4,7 @@ module dcompute.driver.cuda.queue; import dcompute.driver.cuda; struct Queue { - void* raw; + CUstream raw; this (bool async) { status = cast(Status)cuStreamCreate(&raw, async ? 
1 : 0); diff --git a/source/dcompute/driver/cuda/unified_buffer.d b/source/dcompute/driver/cuda/unified_buffer.d index d57a3e2..a5ea664 100644 --- a/source/dcompute/driver/cuda/unified_buffer.d +++ b/source/dcompute/driver/cuda/unified_buffer.d @@ -6,17 +6,10 @@ * migrates data automatically, so explicit copy!(Copy.hostToDevice) / * copy!(Copy.deviceToHost) calls are not needed. * + * * Requirements: * - CUDA Compute Capability >= 3.0 * - Device.supportsUnifiedMemory == true - * - * Limitations (current): - * - prefetch() is a documented no-op stub because cuMemPrefetchAsync is not - * present in the derelict-cuda 3.1.1 binding (API version 6.5). If you - * need deterministic placement before a kernel launch, call - * Context.sync() after writing from the host and before the kernel; the - * driver will migrate pages on first access otherwise. - * (Tracked for discussion with mentors) */ module dcompute.driver.cuda.unified_buffer; @@ -100,24 +93,28 @@ struct UnifiedBuffer(T) @property size_t length() const { return _length; } - // Device-side hints (stubs pending driver-version upgrade) + // Device-side hints /** * Prefetch this buffer's data to a device asynchronously. * - * NOTE: This is currently a **no-op stub**. cuMemPrefetchAsync is not - * present in the derelict-cuda 3.1.1 binding (CUDA API 6.5). Without an - * explicit prefetch the CUDA runtime will migrate pages on first access, - * which is correct but may cause latency on the first kernel invocation. + * Initiates memory migration to the specified device prior to kernel execution + * to avoid on-demand page migration latency. * - * As a workaround, call Context.sync() on the host before launching the - * kernel to ensure all host writes have completed; the driver will then - * migrate on first GPU access. + * Note: Explicit prefetching requires CUDA 8.0 or higher. On older drivers, + * this operation is not supported and will result in a Status.notSupported error. 
*/ - void prefetch(Device dev, Queue q = Queue.init) + @trusted void prefetch(Device dev, Queue q = Queue.init) { - // Stub — intentionally left empty. - // See module documentation for rationale. + if (cuMemPrefetchAsync == null) + { + status = Status.notSupported; + } + else + { + status = cast(Status)cuMemPrefetchAsync(cast(CUdeviceptr)raw, _length * T.sizeof, dev.raw, q.raw); + } + checkErrors(); }