-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathradixsort.cpp
More file actions
81 lines (59 loc) · 2.54 KB
/
radixsort.cpp
File metadata and controls
81 lines (59 loc) · 2.54 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
#include "radixsort.h"
#include <cstdio>
#include <climits>
#include <algorithm>
using namespace std;
// Checks the status returned by a CUDA driver API call.
//   res: the CUresult produced by the call being checked.
// When res is not CUDA_SUCCESS, the numeric code is printed to stdout and the
// process terminates with exit status 1; otherwise the function just returns.
void cuCall(CUresult res) {
    if (res == CUDA_SUCCESS) {
        return;
    }
    printf("cuda error: %d\n", res);
    exit(1);
}
// Initializes the CUDA driver API and creates a context on device ordinal 0.
//   cuContext: receives the handle of the newly created context.
// Every driver call is routed through cuCall, so any failure ends the process.
void cudaInit(CUcontext &cuContext) {
    const unsigned int initFlags = 0;
    cuCall(cuInit(initFlags));

    // Pick the first device the driver enumerates.
    const int deviceOrdinal = 0;
    CUdevice device;
    cuCall(cuDeviceGet(&device, deviceOrdinal));

    cuCall(cuCtxCreate(&cuContext, 0, device));
}
// Destroys a CUDA context.
//   cuContext: the context handle to destroy (e.g. one obtained from cudaInit).
// A failure reported by the driver ends the process via cuCall.
void cudaDestroy(CUcontext &cuContext) {
    const CUresult status = cuCtxDestroy(cuContext);
    cuCall(status);
}
int* radixsort(int* T, int n) {
CUmodule cuModule = (CUmodule) 0;
cuCall(cuModuleLoad(&cuModule, "radixsort.ptx"));
CUfunction computeLocalPositions, computeGlobalPositions, permute;
cuCall(cuModuleGetFunction(&computeLocalPositions, cuModule, "computeLocalPositions"));
cuCall(cuModuleGetFunction(&computeGlobalPositions, cuModule, "computeGlobalPositions"));
cuCall(cuModuleGetFunction(&permute, cuModule, "permute"));
const int BLOCKS_PER_GRID = (n + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
cuCall(cuMemHostRegister(T, sizeof(int) * n * 3, 0));
CUdeviceptr in, pos, out, zerosInBlocks;
cuCall(cuMemAlloc(&in, sizeof(int) * n * 3));
cuCall(cuMemAlloc(&pos, sizeof(int) * n));
cuCall(cuMemAlloc(&out, sizeof(int) * n * 3));
cuCall(cuMemAlloc(&zerosInBlocks, sizeof(int) * BLOCKS_PER_GRID));
cuCall(cuMemcpyHtoD(in, T, sizeof(int) * n * 3));
int* zerosInBlocksHost = new int[BLOCKS_PER_GRID];
cuCall(cuMemHostRegister(zerosInBlocksHost, sizeof(int) * BLOCKS_PER_GRID, 0));
int* sorted = new int[n * 3];
cuCall(cuMemHostRegister(sorted, sizeof(int) * n * 3, 0));
for(int k = 0; k < 63; k++) {
void* args1[] = {&in, &n, &pos, &k, &zerosInBlocks};
cuCall(cuLaunchKernel(computeLocalPositions, BLOCKS_PER_GRID, 1, 1, THREADS_PER_BLOCK, 1, 1, 0, 0, args1, 0));
cuCall(cuMemcpyDtoH(zerosInBlocksHost, zerosInBlocks, sizeof(int) * BLOCKS_PER_GRID));
for(int i = 1; i < BLOCKS_PER_GRID; i++)
zerosInBlocksHost[i] += zerosInBlocksHost[i-1];
cuCall(cuMemcpyHtoD(zerosInBlocks, zerosInBlocksHost, sizeof(int) * BLOCKS_PER_GRID));
void* args2[] = {&in, &n, &pos, &k, &zerosInBlocks};
cuCall(cuLaunchKernel(computeGlobalPositions, BLOCKS_PER_GRID, 1, 1, THREADS_PER_BLOCK, 1, 1, 0, 0, args2, 0));
void* args3[] = {&in, &n, &out, &pos};
cuCall(cuLaunchKernel(permute, BLOCKS_PER_GRID, 1, 1, THREADS_PER_BLOCK, 1, 1, 0, 0, args3, 0));
swap(in, out);
}
cuCall(cuMemcpyDtoH(sorted, in, sizeof(int) * n * 3));
cuCall(cuMemHostUnregister(T));
cuCall(cuMemHostUnregister(zerosInBlocksHost));
cuCall(cuMemHostUnregister(sorted));
delete[] zerosInBlocksHost;
return sorted;
}