Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
# Compiled Static libraries
*.a
*.lib

__dummy_docs
# Executables
*.exe

Expand Down
29 changes: 27 additions & 2 deletions docs/01-installation.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,33 @@ If you are not using dub, DCompute has a few dependencies that you need to
include:

* [derelict-cl](https://github.com/DerelictOrg/DerelictCL) for OpenCL bindings
* [derelict-cuda](https://github.com/DerelictOrg/DerelictCUDA) for CUDA bindings
* [derelict-util](https://github.com/DerelictOrg/DerelictUtil) shared library loading utilities used by the above
* [bindbc-cuda](https://github.com/BindBC/bindbc-cuda) for CUDA bindings
* [derelict-util](https://github.com/DerelictOrg/DerelictUtil) shared library loading utilities used by derelict-cl

Configuring bindbc-cuda
-----------------------

Unlike the previous Derelict bindings, `bindbc-cuda` requires you to specify which
CUDA Driver API version to target via a D version flag in your `dub.json`.
This controls which host-side CUDA functions (e.g. `cuMemPrefetchAsync`) are available.

Add the appropriate version to your `dub.json` configuration:

```json
"versions": ["CUDA_120"]
```

Supported version flags: `CUDA_100`, `CUDA_101`, `CUDA_102`, `CUDA_110`, `CUDA_111`,
`CUDA_112`, `CUDA_118`, `CUDA_120`, `CUDA_122`, `CUDA_124`, `CUDA_130`, `CUDA_132`.

If no version flag is specified, `bindbc-cuda` defaults to `CUDA_100` (CUDA 10.0).
Choose the version that matches the CUDA toolkit installed on your system — you can
check yours by running `nvcc --version`.

**Note:** This version flag is independent of the LDC `-mdcompute-targets` flag.
The `dflags` target (e.g. `cuda-210`) controls which GPU hardware architecture
LDC generates PTX code for, while the `versions` flag controls which driver API
functions are available on the host side.

Drivers
-------
Expand Down
2 changes: 1 addition & 1 deletion dub.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
"license": "BSL-1.0",
"dependencies": {
"derelict-cl" : "~>3.2.0",
"derelict-cuda": "~>3.1.1",
"bindbc-cuda": "~>0.1.0",
"taggedalgebraic": "~>0.10.7"
},
"configurations": [
Expand Down
4 changes: 2 additions & 2 deletions source/dcompute/driver/cuda/context.d
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ import dcompute.driver.cuda;

struct Context
{
void* raw;
CUcontext raw;
this(Device dev, uint flags = 0)
{
status = cast(Status)cuCtxCreate(&raw, flags,dev.raw);
Expand Down Expand Up @@ -62,7 +62,7 @@ struct Context
static @property size_t limit(Limit what)()
{
size_t ret;
status = cast(Status)cuCtxSetLimit(&ret,what);
status = cast(Status)cuCtxGetLimit(&ret,what);
checkErrors();
return ret;
}
Expand Down
1 change: 1 addition & 0 deletions source/dcompute/driver/cuda/device.d
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,7 @@ struct Device
` () { int result; `,
` status = cast(Status)cuDeviceGetAttribute( `,
` &result, `,
` cast(CUdevice_attribute) `,
__traits(getAttributes, __traits(getMember, Info, mem))[0].stringof,
`, raw); `,
` checkErrors(); `,
Expand Down
2 changes: 1 addition & 1 deletion source/dcompute/driver/cuda/event.d
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,6 @@ import dcompute.driver.cuda;

struct Event
{
void* raw;
CUevent raw;

}
2 changes: 1 addition & 1 deletion source/dcompute/driver/cuda/kernel.d
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ module dcompute.driver.cuda.kernel;
import dcompute.driver.cuda;
struct Kernel(F) if (is(F==function)|| is(F==void))
{
void* raw;
CUfunction raw;

static struct Attributes
{
Expand Down
2 changes: 1 addition & 1 deletion source/dcompute/driver/cuda/package.d
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
module dcompute.driver.cuda;

public import ldc.dcompute;
public import derelict.cuda.driverapi;
public import bindbc.cuda;

public import dcompute.driver.error;

Expand Down
7 changes: 6 additions & 1 deletion source/dcompute/driver/cuda/platform.d
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,12 @@ struct Platform
{
static void initialise(uint flags =0)
{
DerelictCUDADriver.load();
auto support = loadCUDA();
if (support == CUDASupport.noLibrary || support == CUDASupport.badLibrary)
{
status = Status.sharedObjectInitFailed;
checkErrors();
}
status = cast(Status)cuInit(flags);
checkErrors();
}
Expand Down
2 changes: 1 addition & 1 deletion source/dcompute/driver/cuda/program.d
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ import dcompute.driver.cuda;
import std.string;
struct Program
{
void* raw;
CUmodule raw;

Kernel!void getKernelByName(immutable(char)* name)
{
Expand Down
2 changes: 1 addition & 1 deletion source/dcompute/driver/cuda/queue.d
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ module dcompute.driver.cuda.queue;
import dcompute.driver.cuda;
struct Queue
{
void* raw;
CUstream raw;
this (bool async)
{
status = cast(Status)cuStreamCreate(&raw, async ? 1 : 0);
Expand Down
35 changes: 16 additions & 19 deletions source/dcompute/driver/cuda/unified_buffer.d
Original file line number Diff line number Diff line change
Expand Up @@ -6,17 +6,10 @@
* migrates data automatically, so explicit copy!(Copy.hostToDevice) /
* copy!(Copy.deviceToHost) calls are not needed.
*
*
* Requirements:
* - CUDA Compute Capability >= 3.0
* - Device.supportsUnifiedMemory == true
*
* Limitations (current):
* - prefetch() is a documented no-op stub because cuMemPrefetchAsync is not
* present in the derelict-cuda 3.1.1 binding (API version 6.5). If you
* need deterministic placement before a kernel launch, call
* Context.sync() after writing from the host and before the kernel; the
* driver will migrate pages on first access otherwise.
* (Tracked for discussion with mentors)
*/
module dcompute.driver.cuda.unified_buffer;

Expand Down Expand Up @@ -100,24 +93,28 @@ struct UnifiedBuffer(T)
@property size_t length() const { return _length; }


// Device-side hints (stubs pending driver-version upgrade)
// Device-side hints

/**
* Prefetch this buffer's data to a device asynchronously.
*
* NOTE: This is currently a **no-op stub**. cuMemPrefetchAsync is not
* present in the derelict-cuda 3.1.1 binding (CUDA API 6.5). Without an
* explicit prefetch the CUDA runtime will migrate pages on first access,
* which is correct but may cause latency on the first kernel invocation.
* Initiates memory migration to the specified device prior to kernel execution
* to avoid on-demand page migration latency.
*
* As a workaround, call Context.sync() on the host before launching the
* kernel to ensure all host writes have completed; the driver will then
* migrate on first GPU access.
* Note: Explicit prefetching requires CUDA 8.0 or higher. On older drivers,
* this operation is not supported and will result in a Status.notSupported error.
*/
void prefetch(Device dev, Queue q = Queue.init)
@trusted void prefetch(Device dev, Queue q = Queue.init)
{
// Stub — intentionally left empty.
// See module documentation for rationale.
if (cuMemPrefetchAsync == null)
{
status = Status.notSupported;
}
else
{
status = cast(Status)cuMemPrefetchAsync(cast(CUdeviceptr)raw, _length * T.sizeof, dev.raw, q.raw);
}
checkErrors();
}


Expand Down