Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
# Compiled Static libraries
*.a
*.lib

__dummy_docs
# Executables
*.exe

Expand Down
29 changes: 27 additions & 2 deletions docs/01-installation.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,33 @@ If you are not using dub, DCompute has a few dependencies that you need to
include:

* [derelict-cl](https://github.com/DerelictOrg/DerelictCL) for OpenCL bindings
* [derelict-cuda](https://github.com/DerelictOrg/DerelictCUDA) for CUDA bindings
* [derelict-util](https://github.com/DerelictOrg/DerelictUtil) shared library loading utilities used by the above
* [bindbc-cuda](https://github.com/BindBC/bindbc-cuda) for CUDA bindings
* [derelict-util](https://github.com/DerelictOrg/DerelictUtil) shared library loading utilities used by derelict-cl

Configuring bindbc-cuda
-----------------------

Unlike the previous Derelict bindings, `bindbc-cuda` requires you to specify which
CUDA Driver API version to target via a D version flag in your `dub.json`.
This controls which host-side CUDA functions (e.g. `cuMemPrefetchAsync`) are available.

Add the appropriate version to your `dub.json` configuration:

```json
"versions": ["CUDA_120"]
```

Supported version flags: `CUDA_100`, `CUDA_101`, `CUDA_102`, `CUDA_110`, `CUDA_111`,
`CUDA_112`, `CUDA_118`, `CUDA_120`, `CUDA_122`, `CUDA_124`, `CUDA_130`, `CUDA_132`.

If no version flag is specified, `bindbc-cuda` defaults to `CUDA_100` (CUDA 10.0).
Choose the version that matches the CUDA toolkit installed on your system — you can
check yours by running `nvcc --version`.

**Note:** This version flag is independent of the LDC `-mdcompute-targets` flag.
The `dflags` target (e.g. `cuda-210`) controls which GPU hardware architecture
LDC generates PTX code for, while the `versions` flag controls which driver API
functions are available on the host side.

Drivers
-------
Expand Down
2 changes: 1 addition & 1 deletion dub.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
"license": "BSL-1.0",
"dependencies": {
"derelict-cl" : "~>3.2.0",
"derelict-cuda": "~>3.1.1",
"bindbc-cuda": "~>0.1.0",
"taggedalgebraic": "~>0.10.7"
},
"configurations": [
Expand Down
4 changes: 2 additions & 2 deletions source/dcompute/driver/cuda/context.d
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ import dcompute.driver.cuda;

struct Context
{
void* raw;
CUcontext raw;
this(Device dev, uint flags = 0)
{
status = cast(Status)cuCtxCreate(&raw, flags,dev.raw);
Expand Down Expand Up @@ -62,7 +62,7 @@ struct Context
static @property size_t limit(Limit what)()
{
size_t ret;
status = cast(Status)cuCtxSetLimit(&ret,what);
status = cast(Status)cuCtxGetLimit(&ret,what);
checkErrors();
return ret;
}
Expand Down
1 change: 1 addition & 0 deletions source/dcompute/driver/cuda/device.d
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,7 @@ struct Device
` () { int result; `,
` status = cast(Status)cuDeviceGetAttribute( `,
` &result, `,
` cast(CUdevice_attribute) `,
__traits(getAttributes, __traits(getMember, Info, mem))[0].stringof,
`, raw); `,
` checkErrors(); `,
Expand Down
2 changes: 1 addition & 1 deletion source/dcompute/driver/cuda/event.d
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,6 @@ import dcompute.driver.cuda;

struct Event
{
void* raw;
CUevent raw;

}
2 changes: 1 addition & 1 deletion source/dcompute/driver/cuda/kernel.d
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ module dcompute.driver.cuda.kernel;
import dcompute.driver.cuda;
struct Kernel(F) if (is(F==function)|| is(F==void))
{
void* raw;
CUfunction raw;

static struct Attributes
{
Expand Down
2 changes: 1 addition & 1 deletion source/dcompute/driver/cuda/package.d
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
module dcompute.driver.cuda;

public import ldc.dcompute;
public import derelict.cuda.driverapi;
public import bindbc.cuda;

public import dcompute.driver.error;

Expand Down
7 changes: 6 additions & 1 deletion source/dcompute/driver/cuda/platform.d
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,12 @@ struct Platform
{
static void initialise(uint flags =0)
{
DerelictCUDADriver.load();
auto support = loadCUDA();
if (support == CUDASupport.noLibrary || support == CUDASupport.badLibrary)
{
status = Status.sharedObjectInitFailed;
checkErrors();
}
status = cast(Status)cuInit(flags);
checkErrors();
}
Expand Down
2 changes: 1 addition & 1 deletion source/dcompute/driver/cuda/program.d
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ import dcompute.driver.cuda;
import std.string;
struct Program
{
void* raw;
CUmodule raw;

Kernel!void getKernelByName(immutable(char)* name)
{
Expand Down
2 changes: 1 addition & 1 deletion source/dcompute/driver/cuda/queue.d
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ module dcompute.driver.cuda.queue;
import dcompute.driver.cuda;
struct Queue
{
void* raw;
CUstream raw;
this (bool async)
{
status = cast(Status)cuStreamCreate(&raw, async ? 1 : 0);
Expand Down
35 changes: 16 additions & 19 deletions source/dcompute/driver/cuda/unified_buffer.d
Original file line number Diff line number Diff line change
Expand Up @@ -6,17 +6,10 @@
* migrates data automatically, so explicit copy!(Copy.hostToDevice) /
* copy!(Copy.deviceToHost) calls are not needed.
*
*
* Requirements:
* - CUDA Compute Capability >= 3.0
* - Device.supportsUnifiedMemory == true
*
* Limitations (current):
* - prefetch() is a documented no-op stub because cuMemPrefetchAsync is not
* present in the derelict-cuda 3.1.1 binding (API version 6.5). If you
* need deterministic placement before a kernel launch, call
* Context.sync() after writing from the host and before the kernel; the
* driver will migrate pages on first access otherwise.
* (Tracked for discussion with mentors)
*/
module dcompute.driver.cuda.unified_buffer;

Expand Down Expand Up @@ -100,24 +93,28 @@ struct UnifiedBuffer(T)
@property size_t length() const { return _length; }


// Device-side hints (stubs pending driver-version upgrade)
// Device-side hints

/**
* Prefetch this buffer's data to a device asynchronously.
*
* NOTE: This is currently a **no-op stub**. cuMemPrefetchAsync is not
* present in the derelict-cuda 3.1.1 binding (CUDA API 6.5). Without an
* explicit prefetch the CUDA runtime will migrate pages on first access,
* which is correct but may cause latency on the first kernel invocation.
* Initiates memory migration to the specified device prior to kernel execution
* to avoid on-demand page migration latency.
*
* As a workaround, call Context.sync() on the host before launching the
* kernel to ensure all host writes have completed; the driver will then
* migrate on first GPU access.
* Note: Explicit prefetching requires CUDA 8.0 or higher. On older drivers,
* this operation is not supported and will result in a Status.notSupported error.
*/
void prefetch(Device dev, Queue q = Queue.init)
@trusted void prefetch(Device dev, Queue q = Queue.init)
{
// Stub — intentionally left empty.
// See module documentation for rationale.
if (cuMemPrefetchAsync == null)
{
status = Status.notSupported;
}
else
{
status = cast(Status)cuMemPrefetchAsync(cast(CUdeviceptr)raw, _length * T.sizeof, dev.raw, q.raw);
}
checkErrors();
}


Expand Down