diff --git a/keopscore/keopscore/binders/LinkCompile.py b/keopscore/keopscore/binders/LinkCompile.py
index aae5d70ea..d1ba38058 100644
--- a/keopscore/keopscore/binders/LinkCompile.py
+++ b/keopscore/keopscore/binders/LinkCompile.py
@@ -84,7 +84,11 @@ def read_info(self):
     def write_code(self):
         # write the generated code in the source file ; this is used as a subfunction of compile_code
         f = open(self.gencode_file, "w")
-        f.write(self.code)
+        if os.name == "nt":
+            f.write(self.code.replace("signed long int", "int"))
+        else:
+            f.write(self.code)
+
         f.close()
 
     def generate_code(self):
diff --git a/keopscore/keopscore/binders/nvrtc/Gpu_link_compile.py b/keopscore/keopscore/binders/nvrtc/Gpu_link_compile.py
index ccd6eb00d..4833cec95 100644
--- a/keopscore/keopscore/binders/nvrtc/Gpu_link_compile.py
+++ b/keopscore/keopscore/binders/nvrtc/Gpu_link_compile.py
@@ -1,6 +1,5 @@
 import os
 from ctypes import create_string_buffer, CDLL, c_int
-from os import RTLD_LAZY
 import sysconfig
 from os.path import join
 
@@ -34,10 +33,13 @@
 def jit_compile_dll():
-    return os.path.join(
-        build_folder,
-        "nvrtc_jit" + sysconfig.get_config_var("SHLIB_SUFFIX"),
-    )
+    if os.name == "nt":
+        return os.path.join(build_folder, "nvrtc_jit.dll")
+    else:
+        return os.path.join(
+            build_folder,
+            "nvrtc_jit" + sysconfig.get_config_var("SHLIB_SUFFIX"),
+        )
 
 
 class Gpu_link_compile(LinkCompile):
@@ -61,7 +63,10 @@ def __init__(self):
             self.low_level_code_prefix + self.gencode_filename,
         ).encode("utf-8")
-        self.my_c_dll = CDLL(jit_compile_dll(), mode=RTLD_LAZY)
+        if os.name != "nt":
+            self.my_c_dll = CDLL(jit_compile_dll(), mode=os.RTLD_LAZY)
+        else:
+            self.my_c_dll = CDLL(jit_compile_dll())
         # actual dll to be called is the jit binary, TODO: check if this is relevant
         self.true_dllname = jit_binary
         # file to check for existence to detect compilation is needed
@@ -75,16 +80,27 @@ def generate_code(self):
         self.write_code()
         # we execute the main dll, passing the code as argument, and the name of the low level code file to save the assembly instructions
-        res = self.my_c_dll.Compile(
-            create_string_buffer(self.low_level_code_file),
-            create_string_buffer(self.code.encode("utf-8")),
-            c_int(self.use_half),
-            c_int(self.use_fast_math),
-            c_int(self.device_id),
-            create_string_buffer(
-                (custom_cuda_include_fp16_path() + os.path.sep).encode("utf-8")
-            ),
-        )
+        if os.name != "nt":
+            res = self.my_c_dll.Compile(
+                create_string_buffer(self.low_level_code_file),
+                create_string_buffer(self.code.encode("utf-8")),
+                c_int(self.use_half),
+                c_int(self.use_fast_math),
+                c_int(self.device_id),
+                create_string_buffer(
+                    (custom_cuda_include_fp16_path() + os.path.sep).encode("utf-8")
+                ),
+            )
+        else:
+            res = self.my_c_dll.Compile(
+                create_string_buffer(self.low_level_code_file),
+                create_string_buffer(self.code.encode("utf-8")),
+                c_int(self.use_half),
+                c_int(self.device_id),
+                create_string_buffer(
+                    (custom_cuda_include_fp16_path() + os.path.sep).encode("utf-8")
+                ),
+            )
         if res != 0:
             KeOps_Error(
                 f"Error when compiling formula (error in nvrtcCompileProgram, nvrtcResult={res})"
             )
@@ -116,8 +132,14 @@ def get_compile_command(
     @staticmethod
     def compile_jit_compile_dll():
         KeOps_Message("Compiling cuda jit compiler engine ... 
", flush=True, end="") - command = Gpu_link_compile.get_compile_command( - sourcename=jit_compile_src, dllname=jit_compile_dll() - ) - KeOps_OS_Run(command) + if os.name == "nt": + from ...windows_compilations import compile_nvrtc_jit + + compile_nvrtc_jit(build_folder=build_folder) + else: + command = Gpu_link_compile.get_compile_command( + sourcename=jit_compile_src, dllname=jit_compile_dll() + ) + KeOps_OS_Run(command) + KeOps_Message("OK", use_tag=False, flush=True) diff --git a/keopscore/keopscore/binders/nvrtc/keops_nvrtc_win.cpp b/keopscore/keopscore/binders/nvrtc/keops_nvrtc_win.cpp new file mode 100644 index 000000000..017201363 --- /dev/null +++ b/keopscore/keopscore/binders/nvrtc/keops_nvrtc_win.cpp @@ -0,0 +1,604 @@ + +// nvcc -shared -Xcompiler -fPIC -lnvrtc -lcuda keops_nvrtc.cu -o keops_nvrtc.so +// g++ --verbose -L/opt/cuda/lib64 -L/opt/cuda/targets/x86_64-linux/lib/ -I/opt/cuda/targets/x86_64-linux/include/ -I../../include -shared -fPIC -lcuda -lnvrtc -fpermissive -DMAXIDGPU=0 -DMAXTHREADSPERBLOCK0=1024 -DSHAREDMEMPERBLOCK0=49152 -DnvrtcGetTARGET=nvrtcGetCUBIN -DnvrtcGetTARGETSize=nvrtcGetCUBINSize -DARCHTAG=\"sm\" keops_nvrtc.cpp -o keops_nvrtc.so +// g++ -std=c++11 -shared -fPIC -O3 -fpermissive -L /usr/lib -L /opt/cuda/lib64 -lcuda -lnvrtc -DnvrtcGetTARGET=nvrtcGetCUBIN -DnvrtcGetTARGETSize=nvrtcGetCUBINSize -DARCHTAG=\"sm\" -I/home/bcharlier/projets/keops/keops/keops/include -I/opt/cuda/include -I/usr/include/python3.10/ -DMAXIDGPU=0 -DMAXTHREADSPERBLOCK0=1024 -DSHAREDMEMPERBLOCK0=49152 /home/bcharlier/projets/keops/keops/keops/binders/nvrtc/keops_nvrtc.cpp -o keops_nvrtc.cpython-310-x86_64-linux-gnu.so + +#include +#include +#include +#include +#include +#include +#include +#include +#include +//#include + +#define C_CONTIGUOUS 1 +#define USE_HALF 0 + +#include "include/Sizes_win.h" +#include "include/Ranges_win.h" +#include "include/utils_pe_win.h" +#include "include/ranges_utils_win.h" + + +#include "include/CudaSizes_win.h" +#include + + +int *build_offset_tables(int nbatchdims, int *shapes, int nblocks, int *lookup_h, + const std::vector< int > &indsi, + const std::vector< int > &indsj, + const std::vector< int > &indsp, + int tagJ) { + + int sizei = indsi.size(); + int sizej = indsj.size(); + int sizep = indsp.size(); + + // Support for broadcasting over batch dimensions ============================================= + + int sizevars = sizei + sizej + sizep; + + // Separate and store the shapes of the "i" and "j" variables + parameters -------------- + // + // shapes is an array of size (1+nargs)*(nbatchdims+3), which looks like: + // [ A, .., B, M, N, D_out] -> output + // [ A, .., B, M, 1, D_1 ] -> "i" variable + // [ A, .., B, 1, N, D_2 ] -> "j" variable + // [ A, .., B, 1, 1, D_3 ] -> "parameter" + // [ A, .., 1, M, 1, D_4 ] -> N.B.: we support broadcasting on the batch dimensions! + // [ 1, .., 1, M, 1, D_5 ] -> (we'll just ask users to fill in the shapes with *explicit* ones) + + //int shapes_i[sizei * (nbatchdims + 1)], shapes_j[sizej * (nbatchdims + 1)], shapes_p[sizep * (nbatchdims + 1)]; + std::vector shapes_i(sizei * (nbatchdims + 1)); + std::vector shapes_j(sizej * (nbatchdims + 1)); + std::vector shapes_p(sizep * (nbatchdims + 1)); + + // First, we fill shapes_i with the "relevant" shapes of the "i" variables, + // making it look like, say: + // [ A, .., B, M] + // [ A, .., 1, M] + // [ A, .., A, M] + // Then, we do the same for shapes_j, but with "N" instead of "M". + // And finally for the parameters, with "1" instead of "M". 
+    fill_shapes(nbatchdims, shapes, shapes_i.data(), shapes_j.data(), shapes_p.data(), tagJ, indsi, indsj, indsp);
+
+    int tagIJ = tagJ; // 1 if the reduction is made "over j", 0 if it is made "over i"
+    int M = shapes[nbatchdims], N = shapes[nbatchdims + 1];
+
+    // We create a lookup table, "offsets", of shape (nblocks, SIZEVARS) --------
+    int *offsets_d = NULL;
+
+    //int offsets_h[nblocks * sizevars];
+    std::vector< int > offsets_h(nblocks * sizevars);
+
+    for (int k = 0; k < nblocks; k++) {
+        int range_id = (int) lookup_h[3 * k];
+        int start_x = tagIJ ? range_id * M : range_id * N;
+        int start_y = tagIJ ? range_id * N : range_id * M;
+
+        int patch_offset = (int) (lookup_h[3 * k + 1] - start_x);
+
+        vect_broadcast_index(start_x, nbatchdims, sizei, shapes, shapes_i.data(), offsets_h.data() + k * sizevars, patch_offset);
+        vect_broadcast_index(start_y, nbatchdims, sizej, shapes, shapes_j.data(), offsets_h.data() + k * sizevars + sizei);
+        vect_broadcast_index(range_id, nbatchdims, sizep, shapes, shapes_p.data(), offsets_h.data() + k * sizevars + sizei + sizej);
+    }
+
+    CUDA_SAFE_CALL(cuMemAlloc((CUdeviceptr *) &offsets_d, sizeof(int) * nblocks * sizevars));
+    CUDA_SAFE_CALL(cuMemcpyHtoD((CUdeviceptr) offsets_d, offsets_h.data(), sizeof(int) * nblocks * sizevars));
+
+    return offsets_d;
+}
+
+
+void range_preprocess_from_device(int &nblocks, int tagI, int nranges_x, int nranges_y, int **castedranges,
+                                  int nbatchdims, int *&slices_x_d, int *&ranges_y_d,
+                                  int *&lookup_d, int *&offsets_d, int blockSize_x,
+                                  const std::vector< int > &indsi,
+                                  const std::vector< int > &indsj,
+                                  const std::vector< int > &indsp,
+                                  int *shapes) {
+
+    // Ranges pre-processing... ==================================================================
+
+    // N.B.: In the following code, we assume that the x-ranges do not overlap.
+    // Otherwise, we'd have to assume that DIMRED == DIMOUT
+    // or allocate a buffer of size nx * DIMRED. This may be done in the future.
+    // Cf. reduction.h:
+    //    FUN::tagJ = 1 for a reduction over j, result indexed by i
+    //    FUN::tagJ = 0 for a reduction over i, result indexed by j
+
+    int tagJ = 1 - tagI;
+    int nranges = tagJ ? nranges_x : nranges_y;
+
+    int *ranges_x = tagJ ? castedranges[0] : castedranges[3];
+    int *slices_x = tagJ ? castedranges[1] : castedranges[4];
+    int *ranges_y = tagJ ? castedranges[2] : castedranges[5];
+
+    std::vector< int > ranges_x_h_arr(2 * nranges);
+    int *ranges_x_h;
+
+    // The code below needs a pointer to ranges_x on *host* memory, -------------------
+    // as well as pointers to slices_x and ranges_y on *device* memory.
+    // -> Depending on the "ranges" location, we'll copy ranges_x *or* slices_x and ranges_y
+    //    to the appropriate memory:
+    bool ranges_on_device = (nbatchdims == 0);
+    // N.B.: We only support Host ranges with Device data when these ranges were created
+    //       to emulate block-sparse reductions.
+
+    if (ranges_on_device) { // The ranges are on the device
+        ranges_x_h = &ranges_x_h_arr[0];
+        // Send data from device to host.
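+        // (each x-range is a [begin, end) pair, hence the 2 * nranges ints copied here)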
+        cuMemcpyDtoH(ranges_x_h, (CUdeviceptr) ranges_x, sizeof(int) * 2 * nranges);
+        slices_x_d = slices_x;
+        ranges_y_d = ranges_y;
+    } else { // The ranges are on host memory; this is typically what happens with **batch processing**,
+        // with ranges generated by keops_io.h:
+        ranges_x_h = ranges_x;
+        // Copy "slices_x" to the device:
+        CUDA_SAFE_CALL(cuMemAlloc((CUdeviceptr *) &slices_x_d, sizeof(int) * nranges));
+        CUDA_SAFE_CALL(cuMemcpyHtoD((CUdeviceptr) slices_x_d, slices_x, sizeof(int) * nranges));
+
+        // Copy "redranges_y" to the device: with batch processing, we KNOW that they have the same shape as ranges_x
+        CUDA_SAFE_CALL(cuMemAlloc((CUdeviceptr *) &ranges_y_d, sizeof(int) * 2 * nranges));
+        CUDA_SAFE_CALL(cuMemcpyHtoD((CUdeviceptr) ranges_y_d, ranges_y, sizeof(int) * 2 * nranges));
+    }
+
+    // Computes the number of blocks needed ---------------------------------------------
+    nblocks = 0;
+    int len_range = 0;
+    for (int i = 0; i < nranges; i++) {
+        len_range = ranges_x_h[2 * i + 1] - ranges_x_h[2 * i];
+        nblocks += (len_range / blockSize_x) + (len_range % blockSize_x == 0 ? 0 : 1);
+    }
+
+    // Create a lookup table for the blocks --------------------------------------------
+    std::vector< int > lookup_h(3 * nblocks);
+    int index = 0;
+
+    for (int i = 0; i < nranges; i++) {
+        len_range = ranges_x_h[2 * i + 1] - ranges_x_h[2 * i];
+        for (int j = 0; j < len_range; j += blockSize_x) {
+            lookup_h[3 * index] = i;
+            lookup_h[3 * index + 1] = ranges_x_h[2 * i] + j;
+            lookup_h[3 * index + 2] = ranges_x_h[2 * i] + j + std::min((int) blockSize_x, len_range - j);
+            index++;
+        }
+    }
+
+    // Load the table on the device -----------------------------------------------------
+    CUDA_SAFE_CALL(cuMemAlloc((CUdeviceptr *) &lookup_d, sizeof(int) * 3 * nblocks));
+    CUDA_SAFE_CALL(cuMemcpyHtoD((CUdeviceptr) lookup_d, lookup_h.data(), sizeof(int) * 3 * nblocks));
+
+
+    // Support for broadcasting over batch dimensions =============================================
+
+    // We create a lookup table, "offsets", of shape (nblocks, SIZEVARS):
+
+    if (nbatchdims > 0) {
+        offsets_d = build_offset_tables(nbatchdims, shapes, nblocks, lookup_h.data(),
+                                        indsi, indsj, indsp, tagJ);
+    }
+
+
+}
+
+
+void
+range_preprocess_from_host(int &nblocks, int tagI, int nranges_x, int nranges_y, int nredranges_x, int nredranges_y,
+                           int **castedranges,
+                           int nbatchdims, int *&slices_x_d, int *&ranges_y_d,
+                           int *&lookup_d, int *&offsets_d, int blockSize_x,
+                           const std::vector< int > &indsi,
+                           const std::vector< int > &indsj,
+                           const std::vector< int > &indsp,
+                           int *shapes) {
+
+    // Ranges pre-processing... ==================================================================
+
+    // N.B.: In the following code, we assume that the x-ranges do not overlap.
+    // Otherwise, we'd have to assume that DIMRED == DIMOUT
+    // or allocate a buffer of size nx * DIMRED. This may be done in the future.
+    // Cf. reduction.h:
+    //    FUN::tagJ = 1 for a reduction over j, result indexed by i
+    //    FUN::tagJ = 0 for a reduction over i, result indexed by j
+
+    int tagJ = 1 - tagI;
+    int nranges = tagJ ? nranges_x : nranges_y;
+    int nredranges = tagJ ? nredranges_y : nredranges_x;
+
+    int *ranges_x = tagJ ? castedranges[0] : castedranges[3];
+    int *slices_x = tagJ ? castedranges[1] : castedranges[4];
+    int *ranges_y = tagJ ?
castedranges[2] : castedranges[5];
+
+    // Computes the number of blocks needed ---------------------------------------------
+    nblocks = 0;
+    int len_range = 0;
+    for (int i = 0; i < nranges; i++) {
+        len_range = ranges_x[2 * i + 1] - ranges_x[2 * i];
+        nblocks += (len_range / blockSize_x) + (len_range % blockSize_x == 0 ? 0 : 1);
+    }
+
+    // Create a lookup table for the blocks --------------------------------------------
+    std::vector< int > lookup_h(3 * nblocks);
+    int index = 0;
+
+    for (int i = 0; i < nranges; i++) {
+        len_range = ranges_x[2 * i + 1] - ranges_x[2 * i];
+        for (int j = 0; j < len_range; j += blockSize_x) {
+            lookup_h[3 * index] = i;
+            lookup_h[3 * index + 1] = ranges_x[2 * i] + j;
+            lookup_h[3 * index + 2] = ranges_x[2 * i] + j + std::min((int) blockSize_x, len_range - j);
+            index++;
+        }
+    }
+
+    // Load the table on the device -----------------------------------------------------
+    CUDA_SAFE_CALL(cuMemAlloc((CUdeviceptr *) &lookup_d, sizeof(int) * 3 * nblocks));
+    CUDA_SAFE_CALL(cuMemcpyHtoD((CUdeviceptr) lookup_d, lookup_h.data(), sizeof(int) * 3 * nblocks));
+
+    // Send data from host to device:
+    CUDA_SAFE_CALL(cuMemAlloc((CUdeviceptr *) &slices_x_d, sizeof(int) * 2 * nranges));
+    CUDA_SAFE_CALL(cuMemcpyHtoD((CUdeviceptr) slices_x_d, slices_x, sizeof(int) * 2 * nranges));
+
+    CUDA_SAFE_CALL(cuMemAlloc((CUdeviceptr *) &ranges_y_d, sizeof(int) * 2 * nredranges));
+    CUDA_SAFE_CALL(cuMemcpyHtoD((CUdeviceptr) ranges_y_d, ranges_y, sizeof(int) * 2 * nredranges));
+
+
+    // Support for broadcasting over batch dimensions =============================================
+
+    // We create a lookup table, "offsets", of shape (nblocks, SIZEVARS):
+
+    if (nbatchdims > 0) {
+        offsets_d = build_offset_tables(nbatchdims, shapes, nblocks, lookup_h.data(),
+                                        indsi, indsj, indsp, tagJ);
+    }
+
+
+}
+
+
+template< typename TYPE >
+class KeOps_module {
+public :
+
+    CUdevice cuDevice;
+    CUcontext ctx;
+    CUmodule module;
+    char *target;
+    CUdeviceptr buffer;
+    int nargs;
+
+    void SetContext() {
+        CUcontext current_ctx;
+        CUDA_SAFE_CALL_NO_EXCEPTION(cuCtxGetCurrent(&current_ctx));
+        if (current_ctx != ctx)
+            CUDA_SAFE_CALL_NO_EXCEPTION(cuCtxPushCurrent(ctx));
+        CUDA_SAFE_CALL_NO_EXCEPTION(cuCtxGetCurrent(&current_ctx));
+    }
+
+
+    void Read_Target(const char *target_file_name) {
+        std::ifstream rf(target_file_name, std::ifstream::binary);
+        size_t targetSize;
+        rf.read((char *) &targetSize, sizeof(size_t));
+        target = new char[targetSize];
+        rf.read(target, targetSize);
+        rf.close();
+
+    }
+
+
+    KeOps_module(int device_id, int nargs_, const char *target_file_name) {
+
+        nargs = nargs_;
+
+        // init cuda in case not already done
+        CUDA_SAFE_CALL(cuInit(0));
+
+        // get the device and the primary context corresponding to device_id
+        CUDA_SAFE_CALL(cuDeviceGet(&cuDevice, device_id));
+        CUDA_SAFE_CALL(cuDevicePrimaryCtxRetain(&ctx, cuDevice));
+
+        // set the primary context as the active current context
+        SetContext();
+
+        // set global variables giving some properties of device
+        SetGpuProps(device_id);
+
+        // read the ptx or cubin file into a char array
+        Read_Target(target_file_name);
+
+        // load the corresponding module
+        CUDA_SAFE_CALL(cuModuleLoadDataEx(&module, target, 0, NULL, NULL));
+
+        // allocate a small memory buffer for "on device" computation mode,
+        // This is just used for storing the list of pointers to device data
+        // as a device array ; it is better to allocate it here once for all,
+        // otherwise allocating it at each call may cause a small overhead.
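+        // One device pointer per argument, i.e. nargs * sizeof(TYPE *) bytes in total.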
+        CUDA_SAFE_CALL(cuMemAlloc(&buffer, nargs * sizeof(TYPE *)));
+
+    }
+
+
+    ~KeOps_module() {
+        SetContext();
+        CUDA_SAFE_CALL_NO_EXCEPTION(cuMemFree(buffer));
+        CUDA_SAFE_CALL_NO_EXCEPTION(cuModuleUnload(module));
+        CUDA_SAFE_CALL_NO_EXCEPTION(cuDevicePrimaryCtxRelease(cuDevice));
+        delete[] target;
+    }
+
+    int launch_kernel(int tagHostDevice, int dimY, int nx, int ny,
+                      int tagI, int tagZero, int use_half,
+                      int tag1D2D, int dimred,
+                      int cuda_block_size, int use_chunk_mode,
+                      std::vector< int > indsi, std::vector< int > indsj, std::vector< int > indsp,
+                      int dimout,
+                      std::vector< int > dimsx, std::vector< int > dimsy, std::vector< int > dimsp,
+                      int **ranges,
+                      std::vector< int > shapeout, TYPE *out,
+                      TYPE **arg,
+                      std::vector< std::vector< int > > argshape
+    ) {
+
+
+        SetContext();
+
+        ////end_ = clock();
+        ////std::cout << "  time for set device : " << double(//end_ - start_) / CLOCKS_PER_SEC << std::endl;
+        //start_ = clock();
+
+        Sizes< TYPE > SS(nargs, arg, argshape, nx, ny,
+                         tagI, use_half,
+                         dimout,
+                         indsi, indsj, indsp,
+                         dimsx, dimsy, dimsp);
+
+        //end_ = clock();
+        //std::cout << "  time for Sizes : " << double(//end_ - start_) / CLOCKS_PER_SEC << std::endl;
+        //start_ = clock();
+
+        if (use_half)
+            SS.switch_to_half2_indexing();
+
+        Ranges< TYPE > RR(SS, ranges);
+        nx = SS.nx;
+        ny = SS.ny;
+
+        //end_ = clock();
+        //std::cout << "  time for Ranges : " << double(//end_ - start_) / CLOCKS_PER_SEC << std::endl;
+        //start_ = clock();
+
+        // now we switch (back...) indsi, indsj and dimsx, dimsy in case tagI=1.
+        // This is to be consistent with the convention used in the old
+        // bindings where i and j variables had different meanings in bindings
+        // and in the core code. Clearly we could do better if we
+        // carefully rewrite some parts of the code
+        if (tagI == 1) {
+            std::vector< int > tmp;
+
+            tmp = indsj;
+            indsj = indsi;
+            indsi = tmp;
+
+            tmp = dimsy;
+            dimsy = dimsx;
+            dimsx = tmp;
+        }
+
+
+        int blockSize_x = 1, blockSize_y = 1, blockSize_z = 1;
+
+        if (use_chunk_mode == 0) {
+            // warning : blockSize.x was previously set to CUDA_BLOCK_SIZE; currently CUDA_BLOCK_SIZE value is used as a bound.
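+            // In other words: blockSize_x = min(cuda_block_size, maxThreadsPerBlock,
+            //                                   sharedMemPerBlock / (dimY * sizeof(TYPE))),
+            // so that one block's shared-memory tile of dimY values per thread always fits.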
+            blockSize_x = std::min(cuda_block_size,
+                                   std::min(maxThreadsPerBlock,
+                                            (int) (sharedMemPerBlock / std::max(1, (int) (dimY * sizeof(TYPE))))
+                                   )
+            ); // number of threads in each block
+        } else {
+            // warning : the value here must match the one which is set in file GpuReduc1D_chunks.py, line 59
+            // and file GpuReduc1D_finalchunks.py, line 67
+            blockSize_x = std::min(cuda_block_size,
+                                   std::min(1024, (int) (49152 / std::max(1, (int) (dimY * sizeof(TYPE)))))
+            );
+        }
+
+        int nblocks;
+
+        if (tagI == 1) {
+            int tmp = ny;
+            ny = nx;
+            nx = tmp;
+        }
+
+        int *lookup_d = NULL, *slices_x_d = NULL, *ranges_y_d = NULL;
+        int *offsets_d = NULL;
+
+        if (RR.tagRanges == 1) {
+            if (tagHostDevice == 1) {
+                range_preprocess_from_device(nblocks, tagI, RR.nranges_x, RR.nranges_y, RR.castedranges,
+                                             SS.nbatchdims, slices_x_d, ranges_y_d, lookup_d,
+                                             offsets_d,
+                                             blockSize_x, indsi, indsj, indsp, SS.shapes);
+            } else { // tagHostDevice==0
+                range_preprocess_from_host(nblocks, tagI, RR.nranges_x, RR.nranges_y, RR.nredranges_x, RR.nredranges_y,
+                                           RR.castedranges,
+                                           SS.nbatchdims, slices_x_d, ranges_y_d, lookup_d,
+                                           offsets_d,
+                                           blockSize_x, indsi, indsj, indsp, SS.shapes);
+            }
+        }
+
+        ////end_ = clock();
+        ////std::cout << "  time for interm : " << double(//end_ - start_) / CLOCKS_PER_SEC << std::endl;
+        //start_ = clock();
+
+        CUdeviceptr p_data;
+        TYPE *out_d;
+        TYPE **arg_d;
+
+        int sizeout = std::accumulate(shapeout.begin(), shapeout.end(), 1, std::multiplies< int >());
+
+        if (tagHostDevice == 1) {
+            p_data = buffer;
+            load_args_FromDevice(p_data, out, out_d, nargs, arg, arg_d);
+        } else
+            load_args_FromHost(p_data, out, out_d, nargs, arg, arg_d, argshape, sizeout);
+
+        ////end_ = clock();
+        ////std::cout << "  time for load_args : " << double(//end_ - start_) / CLOCKS_PER_SEC << std::endl;
+        //start_ = clock();
+
+        CUfunction kernel;
+
+        int gridSize_x = 1, gridSize_y = 1, gridSize_z = 1;
+
+        if (tag1D2D == 1) { // 2D scheme
+
+            gridSize_x = nx / blockSize_x + (nx % blockSize_x == 0 ? 0 : 1);
+            gridSize_y = ny / blockSize_x + (ny % blockSize_x == 0 ? 0 : 1);
+
+            // Reduce : grid and block are both 1d
+            int blockSize2_x = 1, blockSize2_y = 1, blockSize2_z = 1;
+            blockSize2_x = blockSize_x; // number of threads in each block
+            int gridSize2_x = 1, gridSize2_y = 1, gridSize2_z = 1;
+            gridSize2_x = (nx * dimred) / blockSize2_x + ((nx * dimred) % blockSize2_x == 0 ? 0 : 1);
+
+            // Data on the device. We need an "inflated" outB, which contains gridSize.y "copies" of out
+            // that will be reduced in the final pass.
+            TYPE *outB;
+
+            // single cudaMalloc
+            CUdeviceptr p_data_outB;
+            CUDA_SAFE_CALL(cuMemAlloc(&p_data_outB, sizeof(TYPE) * (nx * dimred * gridSize_y)));
+
+            outB = (TYPE *) p_data_outB;
+
+            CUDA_SAFE_CALL(cuModuleGetFunction(&kernel, module, "GpuConv2DOnDevice"));
+
+            void *kernel_params[4];
+            kernel_params[0] = &nx;
+            kernel_params[1] = &ny;
+            kernel_params[2] = &outB;
+            kernel_params[3] = &arg_d;
+
+            // Size of the SharedData : blockSize.x*(DIMY)*sizeof(TYPE)
+
+            CUDA_SAFE_CALL(cuLaunchKernel(kernel,
+                                          gridSize_x, gridSize_y, gridSize_z, // grid dim
+                                          blockSize_x, blockSize_y, blockSize_z, // block dim
+                                          blockSize_x * dimY * sizeof(TYPE), NULL, // shared mem and stream
+                                          kernel_params, 0));
+            // block until the device has completed
+            CUDA_SAFE_CALL(cuCtxSynchronize());
+
+            // Since we've used a 2D scheme, there's still a "blockwise" line reduction to make on
+            // the output array px_d[0] = x1B. 
We go from shape ( gridSize.y * nx, DIMRED ) to (nx, DIMOUT) + CUfunction kernel_reduce; + CUDA_SAFE_CALL(cuModuleGetFunction(&kernel_reduce, module, "reduce2D")); + void *kernel_reduce_params[4]; + kernel_reduce_params[0] = &outB; + kernel_reduce_params[1] = &out_d; + kernel_reduce_params[2] = &gridSize_y; + kernel_reduce_params[3] = &nx; + + CUDA_SAFE_CALL(cuLaunchKernel(kernel_reduce, + gridSize2_x, gridSize2_y, gridSize2_z, // grid dim + blockSize2_x, blockSize2_y, blockSize2_z, // block dim + 0, NULL, // shared mem and stream + kernel_reduce_params, 0)); + + + } else if (RR.tagRanges == 1 && tagZero == 0) { + // ranges mode + + gridSize_x = nblocks; + + CUDA_SAFE_CALL(cuModuleGetFunction(&kernel, module, "GpuConv1DOnDevice_ranges")); + // std::cout << "GpuConv1DOnDevice_ranges " << nx << " " << gridSize_x ; + void *kernel_params[9]; + kernel_params[0] = &nx; + kernel_params[1] = &ny; + kernel_params[2] = &SS.nbatchdims; + kernel_params[3] = &offsets_d; + kernel_params[4] = &lookup_d; + kernel_params[5] = &slices_x_d; + kernel_params[6] = &ranges_y_d; + kernel_params[7] = &out_d; + kernel_params[8] = &arg_d; + + CUDA_SAFE_CALL(cuLaunchKernel(kernel, + gridSize_x, gridSize_y, gridSize_z, // grid dim + blockSize_x, blockSize_y, blockSize_z, // block dim + blockSize_x * dimY * sizeof(TYPE), NULL, // shared mem and stream + kernel_params, 0)); // arguments + + } else { + // simple mode + + gridSize_x = nx / blockSize_x + (nx % blockSize_x == 0 ? 0 : 1); + + CUDA_SAFE_CALL(cuModuleGetFunction(&kernel, module, "GpuConv1DOnDevice")); + + void *kernel_params[4]; + kernel_params[0] = &nx; + kernel_params[1] = &ny; + kernel_params[2] = &out_d; + kernel_params[3] = &arg_d; + + //std::cout << "GpuConv1DOnDevice " << nx << " " << gridSize_x ;//<< " " << gridSize_y << " " << gridSize_z << " " << blockSize_x << " " << blockSize_y << " " << blockSize_z << " " << blockSize_x * dimY * sizeof(TYPE) << std::endl; + + CUDA_SAFE_CALL(cuLaunchKernel(kernel, + gridSize_x, gridSize_y, gridSize_z, // grid dim + blockSize_x, blockSize_y, blockSize_z, // block dim + blockSize_x * dimY * sizeof(TYPE), NULL, // shared mem and stream + kernel_params, 0)); // arguments + } + + CUDA_SAFE_CALL(cuCtxSynchronize()); + + ////end_ = clock(); + ////std::cout << " time for kernel : " << double(//end_ - start_) / CLOCKS_PER_SEC << std::endl; + //start_ = clock(); + + // Send data from device to host. 
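+        // (only needed when the data lives on the host: with tagHostDevice == 1,
+        //  "out" already points to device memory and no copy-back is required)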
+
+
+        if (tagHostDevice == 0) {
+
+            CUDA_SAFE_CALL(cuMemcpyDtoH(out, (CUdeviceptr) out_d, sizeof(TYPE) * sizeout));
+            CUDA_SAFE_CALL(cuMemFree(p_data));
+
+        }
+
+        if (RR.tagRanges == 1) {
+            CUDA_SAFE_CALL(cuMemFree((CUdeviceptr) lookup_d));
+            if (SS.nbatchdims > 0) {
+                CUDA_SAFE_CALL(cuMemFree((CUdeviceptr) slices_x_d));
+                CUDA_SAFE_CALL(cuMemFree((CUdeviceptr) ranges_y_d));
+                CUDA_SAFE_CALL(cuMemFree((CUdeviceptr) offsets_d));
+            }
+        }
+
+        //end_ = end = clock();
+        ////std::cout << "  time for last part : " << double(//end_ - start_) / CLOCKS_PER_SEC << std::endl;
+        ////std::cout << "time for launch_keops inner : " << double(end - start) / CLOCKS_PER_SEC << std::endl;
+
+        return 0;
+    }
+
+};
+
+
+template
+class KeOps_module< float >;
+
+template
+class KeOps_module< double >;
+
+template
+class KeOps_module< half2 >;
diff --git a/keopscore/keopscore/binders/nvrtc/nvrtc_jit_win.cpp b/keopscore/keopscore/binders/nvrtc/nvrtc_jit_win.cpp
new file mode 100644
index 000000000..39bb6c72c
--- /dev/null
+++ b/keopscore/keopscore/binders/nvrtc/nvrtc_jit_win.cpp
@@ -0,0 +1,114 @@
+// nvcc -shared -Xcompiler -fPIC -lnvrtc -lcuda keops_nvrtc.cu -o keops_nvrtc.so
+// g++ --verbose -L/opt/cuda/lib64 -L/opt/cuda/targets/x86_64-linux/lib/ -I/opt/cuda/targets/x86_64-linux/include/ -I../../include -shared -fPIC -lcuda -lnvrtc -fpermissive -DMAXIDGPU=0 -DMAXTHREADSPERBLOCK0=1024 -DSHAREDMEMPERBLOCK0=49152 -DnvrtcGetTARGET=nvrtcGetCUBIN -DnvrtcGetTARGETSize=nvrtcGetCUBINSize -DARCHTAG=\"sm\" keops_nvrtc.cpp -o keops_nvrtc.so
+// g++ -std=c++11 -shared -fPIC -O3 -fpermissive -L /usr/lib -L /opt/cuda/lib64 -lcuda -lnvrtc -DnvrtcGetTARGET=nvrtcGetCUBIN -DnvrtcGetTARGETSize=nvrtcGetCUBINSize -DARCHTAG=\"sm\" -I/home/bcharlier/projets/keops/keops/keops/include -I/opt/cuda/include -I/usr/include/python3.10/ -DMAXIDGPU=0 -DMAXTHREADSPERBLOCK0=1024 -DSHAREDMEMPERBLOCK0=49152 /home/bcharlier/projets/keops/keops/keops/binders/nvrtc/keops_nvrtc.cpp -o keops_nvrtc.cpython-310-x86_64-linux-gnu.so
+
+#include <nvrtc.h>
+#include <cuda.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <vector>
+#include <algorithm>
+//#include <ctime>
+
+#define C_CONTIGUOUS 1
+#define USE_HALF 0
+
+#ifdef _WIN32
+#define DLL_EXPORT extern "C" __declspec(dllexport)
+#else
+#define DLL_EXPORT extern "C"
+#endif
+
+#include "include/Sizes_win.h"
+#include "include/Ranges_win.h"
+#include "include/utils_pe_win.h"
+#include "include/ranges_utils_win.h"
+
+
+#include "include/CudaSizes_win.h"
+#include <cuda_fp16.h>
+
+DLL_EXPORT
+int Compile(const char *target_file_name, const char *cu_code, int use_half, int device_id,
+            const char *cuda_include_path) {
+
+    nvrtcProgram prog;
+
+    int numHeaders;
+    const char *header_names[2];
+    const char *header_sources[2];
+
+    std::ostringstream cuda_fp16_h_path, cuda_fp16_hpp_path;
+    cuda_fp16_h_path << cuda_include_path << "cuda_fp16.h";
+    cuda_fp16_hpp_path << cuda_include_path << "cuda_fp16.hpp";
+
+    if (use_half) {
+        numHeaders = 2;
+        header_names[0] = "cuda_fp16.h";
+        header_sources[0] = read_text_file(cuda_fp16_h_path.str().c_str());
+
+        header_names[1] = "cuda_fp16.hpp";
+        header_sources[1] = read_text_file(cuda_fp16_hpp_path.str().c_str());
+
+    } else {
+        numHeaders = 0;
+    }
+
+    // Get device id from Driver API
+    CUdevice cuDevice;
+    CUDA_SAFE_CALL(cuDeviceGet(&cuDevice, device_id));
+
+    // Get Compute Capability from Driver API
+    int deviceProp_major, deviceProp_minor;
+    CUDA_SAFE_CALL(cuDeviceGetAttribute(&deviceProp_major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
+    CUDA_SAFE_CALL(cuDeviceGetAttribute(&deviceProp_minor,
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
+
+    std::ostringstream arch_flag;
+    arch_flag << "-arch=" << ARCHTAG << "_" << deviceProp_major << deviceProp_minor;
+
+    char *arch_flag_char = strdup(arch_flag.str().c_str());
+    const char *opts[] = {arch_flag_char, "-use_fast_math"};
+
+    NVRTC_SAFE_CALL(nvrtcCreateProgram(&prog,          // prog
+                                       cu_code,        // buffer
+                                       NULL,           // name
+                                       numHeaders,     // numHeaders
+                                       header_sources, // headers
+                                       header_names    // includeNames
+    ));
+
+    nvrtcResult compileResult = nvrtcCompileProgram(prog,  // prog
+                                                    2,     // numOptions
+                                                    opts); // options
+
+    if (compileResult != NVRTC_SUCCESS) {
+        throw std::runtime_error("[KeOps] Error when compiling formula (error in nvrtcCompileProgram).");
+    }
+
+    free(arch_flag_char);
+
+    // Obtain PTX or CUBIN from the program.
+    size_t targetSize;
+    NVRTC_SAFE_CALL(nvrtcGetTARGETSize(prog, &targetSize));
+
+    char *target = new char[targetSize];
+    NVRTC_SAFE_CALL(nvrtcGetTARGET(prog, target));
+
+    // Destroy the program.
+    NVRTC_SAFE_CALL(nvrtcDestroyProgram(&prog));
+
+    // write PTX or CUBIN code to file
+
+    std::ofstream wf(target_file_name, std::ofstream::binary);
+    wf.write((char *) &targetSize, sizeof(size_t));
+    wf.write(target, targetSize);
+    wf.close();
+
+    delete[] target;
+
+    return 0;
+}
diff --git a/keopscore/keopscore/config/__init__.py b/keopscore/keopscore/config/__init__.py
index fc31a0b9f..839b05793 100644
--- a/keopscore/keopscore/config/__init__.py
+++ b/keopscore/keopscore/config/__init__.py
@@ -1,6 +1,13 @@
 # Import the configuration classes
 from .base_config import Config
-from .cuda import CUDAConfig
+import os
+
+if os.name != "nt":
+    from .cuda import CUDAConfig
+else:
+    from .cuda_windows import CUDAConfigWin as CUDAConfig
+
+# TODO openmp and c++ compiler detection for windows
 from .openmp import OpenMPConfig
 from .Platform import DetectPlatform
diff --git a/keopscore/keopscore/config/cuda_windows.py b/keopscore/keopscore/config/cuda_windows.py
new file mode 100644
index 000000000..0660ea010
--- /dev/null
+++ b/keopscore/keopscore/config/cuda_windows.py
@@ -0,0 +1,212 @@
+import os
+import ctypes
+from ctypes.util import find_library
+from ctypes import (
+    c_int,
+    c_void_p,
+    c_char_p,
+    CDLL,
+    byref,
+    cast,
+    POINTER,
+    Structure,
+    RTLD_GLOBAL,
+)
+from pathlib import Path
+import shutil
+from os.path import join
+import platform
+import tempfile
+import subprocess
+import sys
+import keopscore
+from keopscore.utils.misc_utils import KeOps_Warning
+from keopscore.utils.misc_utils import KeOps_OS_Run
+from keopscore.utils.misc_utils import CHECK_MARK, CROSS_MARK
+
+from .cuda import CUDAConfig
+
+from ..windows_compilations import cuda_detection
+
+
+cuda_available = cuda_detection.cuda_available
+
+detection = cuda_detection.detect_cuda_toolkit()
+# cuda_lib = detection['lib_dirs']
+# cuda_include = detection['include_dir']
+# cuda_dll = detection['dll_cuda']
+# cudart_dll = detection['dll_cudart']
+# cuda_nvrtc = detection['dll_nvrtc']
+
+
+class CUDAConfigWin(CUDAConfig):
+    """
+    Class for CUDA detection and configuration on Windows.
+    """
+
+    # CUDA constants
+    CUDA_SUCCESS = 0
+    CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1
+    CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK = 8
+
+    def set_use_cuda(self):
+        """Determine and set whether to use CUDA."""
+        self._use_cuda = cuda_detection.cuda_available
+
+        if not self._use_cuda:
+            self.cuda_message = "CUDA libraries not detected; Switching to CPU only."
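+            # (stored on the instance, and surfaced immediately through the warning below)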
+ KeOps_Warning(self.cuda_message) + + # Check if both cuda and nvrtc libraries are available + if not self._cuda_libraries_available(): + self._use_cuda = False + + self.get_cuda_version() + self.get_cuda_include_path() + self.get_gpu_props() + + if self.n_gpus == 0 and self._use_cuda: + self._use_cuda = False + self.cuda_message = "CUDA libraries detected, but no GPUs found on this system; Switching to CPU only." + KeOps_Warning(self.cuda_message) + + def _cuda_libraries_available(self): + """ + Check if both cuda and nvrtc libraries are available. + Returns: + True if both cuda and nvrtc are loadable, False otherwise. + This is also where we handle one single warning if needed. + """ + + return "dll_nvrtc" in detection and "dll_cuda" in detection + + def get_cuda_version(self, out_type="single_value"): + + if not self._use_cuda: + self.cuda_version = None + return None + try: + + libcudart = ctypes.CDLL(detection["dll_cudart"]) + cuda_version = ctypes.c_int() + libcudart.cudaRuntimeGetVersion(ctypes.byref(cuda_version)) + cuda_version_value = int(cuda_version.value) + + if out_type == "single_value": + self.cuda_version = cuda_version_value + return cuda_version_value + + major = cuda_version_value // 1000 + minor = (cuda_version_value % 1000) // 10 + + if out_type == "major,minor": + return major, minor + elif out_type == "string": + return f"{major}.{minor}" + except Exception as e: + KeOps_Warning(f"Could not determine CUDA version: {e}") + self.cuda_version = None + return None + + def get_gpu_props(self): + """ + Getting GPU properties and related attributes. + """ + if not self._use_cuda: + # Already determined that CUDA is unavailable + self.n_gpus = 0 + self.gpu_compile_flags = "" + return (self.n_gpus, self.gpu_compile_flags) + + # Attempt to load the CUDA driver library + libcuda_path = detection["dll_cuda"] + + # We have a handle, let's proceed + libcuda = ctypes.CDLL(libcuda_path) + result = libcuda.cuInit(0) + if result != self.CUDA_SUCCESS: + KeOps_Warning( + "CUDA was detected, but driver API could not be initialized. Switching to CPU only." + ) + self.n_gpus = 0 + self.gpu_compile_flags = "" + self._use_cuda = False + return (self.n_gpus, self.gpu_compile_flags) + + # Get GPU count + nGpus = ctypes.c_int() + result = libcuda.cuDeviceGetCount(ctypes.byref(nGpus)) + if result != self.CUDA_SUCCESS: + KeOps_Warning( + "CUDA was detected and driver API was initialized, but no working GPU found. " + "Switching to CPU only." + ) + self.n_gpus = 0 + self.gpu_compile_flags = "" + self._use_cuda = False + return (self.n_gpus, self.gpu_compile_flags) + + self.n_gpus = nGpus.value + if self.n_gpus == 0: + self.gpu_compile_flags = "" + return (self.n_gpus, self.gpu_compile_flags) + + # Query each GPU for properties + MaxThreadsPerBlock = [0] * self.n_gpus + SharedMemPerBlock = [0] * self.n_gpus + + def safe_call(dev_idx, result_code): + if result_code != self.CUDA_SUCCESS: + KeOps_Warning( + f"Error detecting properties for GPU device {dev_idx}. " + "Switching to CPU only." 
+                )
+                return False
+            return True
+
+        for d in range(self.n_gpus):
+            device = ctypes.c_int()
+            if not safe_call(d, libcuda.cuDeviceGet(ctypes.byref(device), d)):
+                self.n_gpus = 0
+                self.gpu_compile_flags = ""
+                self._use_cuda = False
+                return (self.n_gpus, self.gpu_compile_flags)
+
+            output = ctypes.c_int()
+            if not safe_call(
+                d,
+                libcuda.cuDeviceGetAttribute(
+                    byref(output),
+                    self.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
+                    device,
+                ),
+            ):
+                self.n_gpus = 0
+                self.gpu_compile_flags = ""
+                self._use_cuda = False
+                return (self.n_gpus, self.gpu_compile_flags)
+            MaxThreadsPerBlock[d] = output.value
+
+            if not safe_call(
+                d,
+                libcuda.cuDeviceGetAttribute(
+                    byref(output),
+                    self.CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK,
+                    device,
+                ),
+            ):
+                self.n_gpus = 0
+                self.gpu_compile_flags = ""
+                self._use_cuda = False
+                return (self.n_gpus, self.gpu_compile_flags)
+            SharedMemPerBlock[d] = output.value
+
+        # Build compile flags string #TODO auto detection for macros is here
+        self.gpu_compile_flags = f"-DMAXIDGPU={self.n_gpus - 1} "
+        for d in range(self.n_gpus):
+            self.gpu_compile_flags += (
+                f"-DMAXTHREADSPERBLOCK{d}={MaxThreadsPerBlock[d]} "
+            )
+            self.gpu_compile_flags += f"-DSHAREDMEMPERBLOCK{d}={SharedMemPerBlock[d]} "
+
+        return self.n_gpus, self.gpu_compile_flags
diff --git a/keopscore/keopscore/include/CudaSizes_win.h b/keopscore/keopscore/include/CudaSizes_win.h
new file mode 100644
index 000000000..09862a030
--- /dev/null
+++ b/keopscore/keopscore/include/CudaSizes_win.h
@@ -0,0 +1,96 @@
+#pragma once
+
+/////////////////////////////////////////////
+//              GPU Options                //
+/////////////////////////////////////////////
+
+
+
+// fix some Gpu properties
+// CUDA_BLOCK_SIZE gives an upper bound on the size of Cuda blocks
+// The actual block size may be lower due to memory limitations, depending on the formula used
+#ifndef CUDA_BLOCK_SIZE
+#define CUDA_BLOCK_SIZE 192
+#endif
+// Here we define the maximum number of threads per block and the shared memory per block
+// These values can depend on the Gpu, although in fact the values 1024 and 49152 respectively
+// are the right ones for almost all cards.
+// So these values should be fine, but you can check them with the GetGpuProps.cu program
+// Here we assume that: either the user has defined MAXIDGPU (=number of Gpu devices minus one)
+// and corresponding specific values MAXTHREADSPERBLOCK0, SHAREDMEMPERBLOCK0, MAXTHREADSPERBLOCK1, SHAREDMEMPERBLOCK1, ...
+// for each device, or MAXIDGPU is not defined, and we will use global MAXTHREADSPERBLOCK and SHAREDMEMPERBLOCK
+#ifndef MAXIDGPU
+// we give default values
+#ifndef MAXTHREADSPERBLOCK
+#define MAXTHREADSPERBLOCK 1024
+#endif
+#ifndef SHAREDMEMPERBLOCK
+#define SHAREDMEMPERBLOCK 49152
+#endif
+#endif
+
+// global variables maxThreadsPerBlock and sharedMemPerBlock may depend on the device, so we will set them at each call using
+// predefined MAXTHREADSPERBLOCK0, SHAREDMEMPERBLOCK0, MAXTHREADSPERBLOCK1, SHAREDMEMPERBLOCK1, etc.
+// through the function SetGpuProps
+int maxThreadsPerBlock, sharedMemPerBlock;
+
+#define SET_GPU_PROPS_MACRO(n) \
+    if(device == n) { \
+        maxThreadsPerBlock = MAXTHREADSPERBLOCK ## n; \
+        sharedMemPerBlock = SHAREDMEMPERBLOCK ## n; \
+        return; \
+    }
+
+// I have not managed to use a "recursive macro" hack, it was not compiling on all systems.
+// This assumes the number of Gpus is <= 12 ; feel free to add more lines if needed !
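+// For instance, SET_GPU_PROPS_MACRO(0) expands (via the ## token-pasting operator) to:
+//   if(device == 0) { maxThreadsPerBlock = MAXTHREADSPERBLOCK0; sharedMemPerBlock = SHAREDMEMPERBLOCK0; return; }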
+void SetGpuProps(int device) { + +#if defined(MAXTHREADSPERBLOCK) && defined(SHAREDMEMPERBLOCK) + // global values are defined + maxThreadsPerBlock = MAXTHREADSPERBLOCK; + sharedMemPerBlock = SHAREDMEMPERBLOCK; + return; +#else +#if MAXIDGPU >= 0 + SET_GPU_PROPS_MACRO(0) +#endif +#if MAXIDGPU >= 1 + SET_GPU_PROPS_MACRO(1) +#endif +#if MAXIDGPU >= 2 + SET_GPU_PROPS_MACRO(2) +#endif +#if MAXIDGPU >= 3 + SET_GPU_PROPS_MACRO(3) +#endif +#if MAXIDGPU >= 4 + SET_GPU_PROPS_MACRO(4) +#endif +#if MAXIDGPU >= 5 + SET_GPU_PROPS_MACRO(5) +#endif +#if MAXIDGPU >= 6 + SET_GPU_PROPS_MACRO(6) +#endif +#if MAXIDGPU >= 7 + SET_GPU_PROPS_MACRO(7) +#endif +#if MAXIDGPU >= 8 + SET_GPU_PROPS_MACRO(8) +#endif +#if MAXIDGPU >= 9 + SET_GPU_PROPS_MACRO(9) +#endif +#if MAXIDGPU >= 10 + SET_GPU_PROPS_MACRO(10) +#endif +#if MAXIDGPU >= 11 + SET_GPU_PROPS_MACRO(11) +#endif + fprintf( stderr, "invalid Gpu device number. If the number of available Gpus is > 12, add required lines at the end of function SetGpuProps and recompile.\n"); + throw std::runtime_error("[KeOps] Cuda error."); +#endif + +} + + diff --git a/keopscore/keopscore/include/Ranges_win.h b/keopscore/keopscore/include/Ranges_win.h new file mode 100644 index 000000000..19270499e --- /dev/null +++ b/keopscore/keopscore/include/Ranges_win.h @@ -0,0 +1,99 @@ +#pragma once + +#include "Sizes_win.h" + +template< typename TYPE > +class Ranges { +public: + int tagRanges, nranges_x, nranges_y, nredranges_x, nredranges_y; + + std::vector< int > ranges_i, slices_i, redranges_j; + std::vector< int * > _castedranges; + int **castedranges; + + Ranges(Sizes< TYPE > sizes, int **ranges) { + + _castedranges.resize(6); + + // Sparsity: should we handle ranges? ====================================== + if (sizes.nbatchdims == 0) { // Standard M-by-N computation + if (ranges[6][0] == -1) { + tagRanges = 0; + + nranges_x = 0; + nranges_y = 0; + + nredranges_x = 0; + nredranges_y = 0; + + } else { + tagRanges = 1; + nranges_x = ranges[6][0]; + nranges_y = ranges[6][3]; + nredranges_x = ranges[6][5]; + nredranges_y = ranges[6][2]; + + // get the pointers to data to avoid a copy + for (int i = 0; i < 6; i++) { + _castedranges[i] = ranges[i]; + } + } + + } else if (ranges[6][0] == -1) { + // Batch processing: we'll have to generate a custom, block-diagonal sparsity pattern + tagRanges = 1; // Batch processing is emulated through the block-sparse mode + + // Create new "castedranges" from scratch ------------------------------ + // With pythonic notations, we'll have: + // castedranges = (ranges_i, slices_i, redranges_j, ranges_j, slices_j, redranges_i) + // with: + // - ranges_i = redranges_i = [ [0,M], [M,2M], ..., [(nbatches-1)M, nbatches*M] ] + // - slices_i = slices_j = [ 1, 2, ..., nbatches-1, nbatches ] + // - redranges_j = ranges_j = [ [0,N], [N,2N], ..., [(nbatches-1)N, nbatches*N] ] + + + //int ranges_i[2 * sizes.nbatches]; // ranges_i + ranges_i.resize(2 * sizes.nbatches, 0); + + //int slices_i[sizes.nbatches]; // slices_i + slices_i.resize(sizes.nbatches, 0); + + //int redranges_j[2 * sizes.nbatches]; // redranges_j + redranges_j.resize(2 * sizes.nbatches, 0); + + for (int b = 0; b < sizes.nbatches; b++) { + ranges_i[2 * b] = b * sizes.M; + ranges_i[2 * b + 1] = (b + 1) * sizes.M; + slices_i[b] = (b + 1); + redranges_j[2 * b] = b * sizes.N; + redranges_j[2 * b + 1] = (b + 1) * sizes.N; + } + + _castedranges[0] = &ranges_i[0]; + _castedranges[1] = &slices_i[0]; + _castedranges[2] = &redranges_j[0]; + _castedranges[3] = &redranges_j[0]; // ranges_j + _castedranges[4] = 
&slices_i[0]; // slices_j
+            _castedranges[5] = &ranges_i[0]; // redranges_i
+
+            nranges_x = sizes.nbatches;
+            nredranges_x = sizes.nbatches;
+            nranges_y = sizes.nbatches;
+            nredranges_y = sizes.nbatches;
+        }
+#if do_checks
+        else {
+            throw std::runtime_error(
+                "[KeOps] The 'ranges' argument (block-sparse mode) is not supported with batch processing, "
+                "but we detected " + std::to_string(sizes.nbatchdims) + " > 0 batch dimensions."
+            );
+        }
+#endif
+
+        castedranges = &_castedranges[0];
+
+    };
+
+};
diff --git a/keopscore/keopscore/include/Sizes_win.h b/keopscore/keopscore/include/Sizes_win.h
new file mode 100644
index 000000000..c78f30e7c
--- /dev/null
+++ b/keopscore/keopscore/include/Sizes_win.h
@@ -0,0 +1,402 @@
+#pragma once
+
+#include <vector>
+#include <algorithm>
+#include <numeric>
+#include <functional>
+
+
+#define MIN(a, b) (((a)<(b))?(a):(b))
+#define MAX(a, b) (((a)<(b))?(b):(a))
+#define MAX3(a, b, c) (MAX(MAX(a,b),c))
+
+#define do_checks 0
+#if do_checks
+void error(std::string message) {
+    throw std::runtime_error(message);
+}
+#endif
+
+
+#if C_CONTIGUOUS
+
+int get_val_batch(std::vector< int > _shape, int nbatch, int b) {
+    return _shape[b];
+}
+
+#else
+
+int get_val_batch(std::vector< int > _shape, int nbatch, int b) {
+    return _shape[nbatch - b];
+}
+
+#endif
+
+template< typename TYPE >
+class Sizes {
+public:
+
+    // attributes
+    int nargs;
+    int nx, ny;
+    int M, N;
+    int nbatchdims;
+    int nbatches;
+
+    std::vector< int > _shapes;
+    int *shapes;
+    std::vector< int > _shape_out;
+    int *shape_out;
+
+    int tagIJ;
+    int use_half;
+    std::vector< int > indsI;
+    std::vector< int > indsJ;
+    std::vector< int > indsP;
+    int pos_first_argI;
+    int pos_first_argJ;
+    int dimout;
+    int nminargs;
+    int nvarsI;
+    int nvarsJ;
+    int nvarsP;
+    std::vector< int > dimsX;
+    std::vector< int > dimsY;
+    std::vector< int > dimsP;
+
+    // constructors
+    Sizes(int _nargs, TYPE **args, const std::vector< std::vector< int > > &argshapes, int _nx, int _ny,
+          int tagIJ_, int use_half_, int dimout_,
+          const std::vector< int > &indsI_, std::vector< int > indsJ_, const std::vector< int > &indsP_,
+          const std::vector< int > &dimsX_, std::vector< int > dimsY_, const std::vector< int > &dimsP_) {
+
+        tagIJ = tagIJ_;
+        use_half = use_half_;
+        indsI = indsI_;
+        indsJ = indsJ_;
+        indsP = indsP_;
+        dimout = dimout_;
+
+        nvarsI = indsI.size();
+        nvarsJ = indsJ.size();
+        nvarsP = indsP.size();
+
+        pos_first_argI = (nvarsI > 0) ? *std::min_element(indsI.begin(), indsI.end()) : -1;
+        pos_first_argJ = (nvarsJ > 0) ? *std::min_element(indsJ.begin(), indsJ.end()) : -1;
+
+        int max_i = (nvarsI > 0) ? *std::max_element(indsI.begin(), indsI.end()) : -1;
+        int max_j = (nvarsJ > 0) ? *std::max_element(indsJ.begin(), indsJ.end()) : -1;
+        int max_p = (nvarsP > 0) ? *std::max_element(indsP.begin(), indsP.end()) : -1;
+
+        nminargs = 1 + MAX3(max_i, max_j, max_p);
+        dimsX = dimsX_;
+        dimsY = dimsY_;
+        dimsP = dimsP_;
+        nargs = _nargs;
+        nx = _nx;
+        ny = _ny;
+
+        // fill shapes with "batch dimensions" [A, .., B], the table will look like:
+        //
+        // [ A, .., B, M, N, D_out]  -> output
+        // [ A, .., B, M, 1, D_1  ]  -> "i" variable
+        // [ A, .., B, 1, N, D_2  ]  -> "j" variable
+        // [ A, .., B, 1, 1, D_3  ]  -> "parameter"
+        // [ A, .., 1, M, 1, D_4  ]  -> N.B.: we support broadcasting on the batch dimensions!
+        // [ 1, .., 1, M, 1, D_5  ]  -> (we'll just ask users to fill in the shapes with *explicit* ones)
+        fill_shape(nargs, argshapes);
+
+        check_ranges(argshapes);
+
+        // fill shape_out
+        _shape_out.resize(nbatchdims + 3);
+
+#if C_CONTIGUOUS
+        std::copy(_shapes.begin(), _shapes.begin() + nbatchdims + 3, _shape_out.begin()); // Copy the "batch dimensions"
+        _shape_out.erase(_shape_out.begin() + nbatchdims + (1 - tagIJ));
+
+#else
+        std::reverse_copy(_shapes.begin(), _shapes.begin() + nbatchdims + 3,
+                          _shape_out.begin()); // Copy the "batch dimensions"
+        _shape_out.erase(_shape_out.begin() + 1 + tagIJ);
+
+#endif
+
+        // fill nx and ny
+        M = _shapes[nbatchdims]; // = M
+        N = _shapes[nbatchdims + 1]; // = N
+
+        // Compute the product of all "batch dimensions"
+        nbatches = std::accumulate(_shapes.begin(), _shapes.begin() + nbatchdims, 1, std::multiplies< int >());
+
+        nx = nbatches * M; // = A * ... * B * M
+        ny = nbatches * N; // = A * ... * B * N
+
+        shapes = &_shapes[0];
+        shape_out = &_shape_out[0];
+    }
+
+
+    // methods
+
+    void switch_to_half2_indexing();
+
+private:
+    void fill_shape(const int nargs, const std::vector< std::vector< int > > &argshapes);
+
+    void check_ranges(const std::vector< std::vector< int > > &argshapes);
+
+    int MN_pos, D_pos;
+};
+
+
+template< typename TYPE >
+void Sizes< TYPE >::fill_shape(const int nargs, const std::vector< std::vector< int > > &argshapes) {
+
+    int pos = std::max(pos_first_argI, pos_first_argJ);
+
+    if (pos > -1) {
+        // Are we working in batch mode? Infer the answer from the first arg =============
+        nbatchdims = argshapes[pos].size() - 2; // number of batch dimensions = Number of dims of the first tensor - 2
+
+        if (nbatchdims < 0) {
+#if do_checks
+            error("[KeOps] Wrong number of dimensions for arg at position 0: is "
+                  + std::to_string(argshapes[0].size()) + " but should be at least 2."
+            );
+#endif
+        }
+    } else {
+        nbatchdims = 0;
+    }
+
+#if C_CONTIGUOUS
+    MN_pos = nbatchdims;
+    D_pos = nbatchdims + 1;
+#else
+    D_pos = 0;
+    MN_pos = 1;
+#endif
+
+    // Now, we'll keep track of the output + all arguments' shapes in a large array:
+    _shapes.resize((nargs + 1) * (nbatchdims + 3), 1);
+
+    if (use_half) {
+        if (tagIJ == 0) {
+            _shapes[nbatchdims] = nx % 2 ? nx + 1 : nx;
+            _shapes[nbatchdims + 1] = 2 * ny;
+        } else {
+            _shapes[nbatchdims] = 2 * nx;
+            _shapes[nbatchdims + 1] = ny % 2 ? ny + 1 : ny;
+        }
+    } else {
+        _shapes[nbatchdims] = nx;
+        _shapes[nbatchdims + 1] = ny;
+    }
+
+    _shapes[nbatchdims + 2] = dimout; // Top right corner: dimension of the output
+
+}
+
+template< typename TYPE >
+void Sizes< TYPE >::check_ranges(const std::vector< std::vector< int > > &argshapes) {
+
+    // Check the compatibility of all tensor shapes ==================================
+    if (nminargs > 0) {
+
+        // Checks args in all the positions that correspond to "i" variables:
+        for (int k = 0; k < nvarsI; k++) {
+            int i = indsI[k];
+
+            // Fill in the (i+1)-th line of the "shapes" array ---------------------------
+            int off_i = (i + 1) * (nbatchdims + 3);
+
+            // Check the number of dimensions --------------------------------------------
+            int ndims = argshapes[i].size(); // Number of dims of the i-th tensor
+
+#if do_checks
+            if (ndims != nbatchdims + 2) {
+                error("[KeOps] Wrong number of dimensions for arg at position " + std::to_string(i)
+                      + " (i type): KeOps detected " + std::to_string(nbatchdims)
+                      + " batch dimensions from the first argument 0, and thus expected "
+                      + std::to_string(nbatchdims + 2)
+                      + " dimensions here, but only received "
+                      + std::to_string(ndims)
+                      + ". 
Note that KeOps supports broadcasting on batch dimensions, " + + "but still expects 'dummy' unit dimensions in the input shapes, " + + "for the sake of clarity."); + } +#endif + + + + // First, the batch dimensions: + for (int b = 0; b < nbatchdims; b++) { + _shapes[off_i + b] = get_val_batch(argshapes[i], nbatchdims + 2, b); + + // Check that the current value is compatible with what + // we've encountered so far, as stored in the first line of "shapes" + if (_shapes[off_i + b] != 1) { // This dimension is not "broadcasted" + if (_shapes[b] == 1) { + _shapes[b] = _shapes[off_i + b]; // -> it becomes the new standard + } +#if do_checks + else if (_shapes[b] != _shapes[off_i + b]) { + error("[KeOps] Wrong value of the batch dimension " + + std::to_string(b) + " for argument number " + std::to_string(i) + + " : is " + std::to_string(_shapes[off_i + b]) + + " but was " + std::to_string(_shapes[b]) + + " or 1 in previous arguments."); + } +#endif + } + } + + _shapes[off_i + nbatchdims] = argshapes[i][MN_pos]; // = "M" + _shapes[off_i + nbatchdims + 2] = argshapes[i][D_pos]; // = "D" + + +#if do_checks + // Check the number of "lines": + if (_shapes[nbatchdims] != _shapes[off_i + nbatchdims]) { + error("[KeOps] Wrong value of the 'i' dimension " + + std::to_string(nbatchdims) + "for arg at position " + std::to_string(i) + + " : is " + std::to_string(_shapes[off_i + nbatchdims]) + + " but was " + std::to_string(_shapes[nbatchdims]) + + " in previous 'i' arguments."); + } + + // And the number of "columns": + if (_shapes[off_i + nbatchdims + 2] != static_cast< int >(dimsX[k])) { + error("[KeOps] Wrong value of the 'vector size' dimension " + + std::to_string(nbatchdims + 1) + " for arg at position " + std::to_string(i) + + " : is " + std::to_string(_shapes[off_i + nbatchdims + 2]) + + " but should be " + std::to_string(dimsX[k])); + } +#endif + } + + + // Checks args in all the positions that correspond to "j" variables: + for (int k = 0; k < nvarsJ; k++) { + int i = indsJ[k]; + + // Check the number of dimensions -------------------------------------------- + int ndims = argshapes[i].size(); // Number of dims of the i-th tensor + +#if do_checks + if (ndims != nbatchdims + 2) { + error("[KeOps] Wrong number of dimensions for arg at position " + std::to_string(i) + + " (j type): KeOps detected " + std::to_string(nbatchdims) + + " batch dimensions from the first argument 0, and thus expected " + + std::to_string(nbatchdims + 2) + + " dimensions here, but only received " + + std::to_string(ndims) + + ". 
Note that KeOps supports broadcasting on batch dimensions, " + + "but still expects 'dummy' unit dimensions in the input shapes, " + + "for the sake of clarity."); + } +#endif + + // Fill in the (i+1)-th line of the "shapes" array --------------------------- + int off_i = (i + 1) * (nbatchdims + 3); + + // First, the batch dimensions: + for (int b = 0; b < nbatchdims; b++) { + _shapes[off_i + b] = get_val_batch(argshapes[i], nbatchdims + 2, b); + + // Check that the current value is compatible with what + // we've encountered so far, as stored in the first line of "shapes" + if (_shapes[off_i + b] != 1) { // This dimension is not "broadcasted" + if (_shapes[b] == 1) { + _shapes[b] = _shapes[off_i + b]; // -> it becomes the new standard + } +#if do_checks + else if (_shapes[b] != _shapes[off_i + b]) { + error("[KeOps] Wrong value of the batch dimension " + + std::to_string(b) + " for argument number " + std::to_string(i) + + " : is " + std::to_string(_shapes[off_i + b]) + + " but was " + std::to_string(_shapes[b]) + + " or 1 in previous arguments."); + } +#endif + } + } + + _shapes[off_i + nbatchdims + 1] = argshapes[i][MN_pos]; // = "N" + _shapes[off_i + nbatchdims + 2] = argshapes[i][D_pos]; // = "D" + + +#if do_checks + // Check the number of "lines": + if (_shapes[nbatchdims + 1] != _shapes[off_i + nbatchdims + 1]) { + error("[KeOps] Wrong value of the 'j' dimension " + + std::to_string(nbatchdims) + " for arg at position " + std::to_string(i) + + " : is " + std::to_string(_shapes[off_i + nbatchdims + 1]) + + " but was " + std::to_string(_shapes[nbatchdims + 1]) + + " in previous 'j' arguments."); + } + + // And the number of "columns": + if (_shapes[off_i + nbatchdims + 2] != static_cast< int >(dimsY[k])) { + error("[KeOps] Wrong value of the 'vector size' dimension " + + std::to_string(nbatchdims + 1) + " for arg at position " + std::to_string(i) + + " : is " + std::to_string(_shapes[off_i + nbatchdims + 2]) + + " but should be " + std::to_string(dimsY[k])); + } +#endif + } + + + for (int k = 0; k < nvarsP; k++) { + int i = indsP[k]; + // Fill in the (i+1)-th line of the "shapes" array --------------------------- + int off_i = (i + 1) * (nbatchdims + 3); + // First, the batch dimensions: + for (int b = 0; b < nbatchdims; b++) { + _shapes[off_i + b] = get_val_batch(argshapes[i], nbatchdims + 2, b); + } + _shapes[off_i + nbatchdims + 2] = argshapes[i][nbatchdims]; // = "D" +#if do_checks + int dim_param; + if (use_half) + dim_param = _shapes[off_i + nbatchdims + 2] / 2; + else + dim_param = _shapes[off_i + nbatchdims + 2]; + if (dim_param != static_cast< int >(dimsP[k])) { + error("[KeOps] Wrong value of the 'vector size' dimension " + + std::to_string(nbatchdims) + " for arg at position " + std::to_string(i) + + " : is " + std::to_string(dim_param) + + " but should be " + std::to_string(dimsP[k])); + } +#endif + } + } + +} + +template< typename TYPE > +void Sizes< TYPE >::switch_to_half2_indexing() { + // special case of float16 inputs : because we use half2 type in Cuda codes, we need to divide by two nx, ny, and M, N, or D + // values inside the shapes vector. + nx = nx / 2; + ny = ny / 2; + M = M / 2; + N = N / 2; + _shapes[nbatchdims] = _shapes[nbatchdims] / 2; + _shapes[nbatchdims + 1] = _shapes[nbatchdims + 1] / 2; + for (int i = 0; i < nargs; i++) { + int off_i = (i + 1) * (nbatchdims + 3); + // we don't have anymore the category information... 
+ // the last three dimensions are either of the form (M,1,D), (1,N,D), or (1,1,D) + // where M or N are even in the 2 first cases, or D is even in the third case. + if (_shapes[off_i + nbatchdims] > 1) + _shapes[off_i + nbatchdims] = _shapes[off_i + nbatchdims] / 2; + else if (_shapes[off_i + nbatchdims + 1] > 1) + _shapes[off_i + nbatchdims + 1] = _shapes[off_i + nbatchdims + 1] / 2; + else + _shapes[off_i + nbatchdims + 2] = _shapes[off_i + nbatchdims + 2] / 2; + } +} + diff --git a/keopscore/keopscore/include/ranges_utils_win.h b/keopscore/keopscore/include/ranges_utils_win.h new file mode 100644 index 000000000..372b61c06 --- /dev/null +++ b/keopscore/keopscore/include/ranges_utils_win.h @@ -0,0 +1,82 @@ +#pragma once + + +int broadcast_index(int i, int nbatchdims, int *full_shape, int *shape) { + int M_N = shape[nbatchdims]; + int res = i % M_N, step = M_N, full_step = M_N; + for (int b = nbatchdims; b > 0; b--) { + if (shape[b - 1] != 1) { + res += ((i / full_step) % shape[b - 1]) * step; + } + full_step *= full_shape[b - 1]; + step *= shape[b - 1]; + } + return res; +} + +void vect_broadcast_index(int i, int nbatchdims, int nvars, int *full_shape, + int *reduced_shapes, int *out, int add_offset = 0) { + for (int k = 0; k < nvars; k++) { + out[k] = add_offset + broadcast_index(i, nbatchdims, full_shape, reduced_shapes + (nbatchdims + 1) * k); + } +} + +void fill_shapes(int nbatchdims, int *shapes, int *shapes_i, int *shapes_j, int *shapes_p, + int tagJ, + const std::vector< int > &indsi, + const std::vector< int > &indsj, + const std::vector< int > &indsp) { + + int sizei = indsi.size(); + int sizej = indsj.size(); + int sizep = indsp.size(); + + sizei += 1; + + const int tagIJ = tagJ; // 1 if the reduction is made "over j", 0 if it is made "over i" + + // Separate and store the shapes of the "i" and "j" variables + parameters -------------- + // + // N.B.: If tagIJ == 1, the reduction is made over 'j', which is the default mode. + // However, if tagIJ == 0, the reduction is performed over the 'i' variables: + // since "shape" does not change, we must adapt the adress at which we pick information... + // + // shapes is an array of size (1+nargs)*(nbatchdims+3), which looks like: + // [ A, .., B, M, N, D_out] -> output + // [ A, .., B, M, 1, D_1 ] -> "i" variable + // [ A, .., B, 1, N, D_2 ] -> "j" variable + // [ A, .., B, 1, 1, D_3 ] -> "parameter" + // [ A, .., 1, M, 1, D_4 ] -> N.B.: we support broadcasting on the batch dimensions! 
+    // [ 1, .., 1, M, 1, D_5  ]  -> (we'll just ask users to fill in the shapes with *explicit* ones)
+
+    // First, we fill shapes_i with the "relevant" shapes of the "i" variables,
+    // making it look like, say:
+    // [ A, .., B, M]
+    // [ A, .., 1, M]
+    // [ A, .., A, M]
+    for (int k = 0; k < (sizei - 1); k++) { // k-th line
+        for (int l = 0; l < nbatchdims; l++) { // l-th column
+            shapes_i[k * (nbatchdims + 1) + l] = shapes[(1 + indsi[k]) * (nbatchdims + 3) + l];
+        }
+        shapes_i[k * (nbatchdims + 1) + nbatchdims] =
+                shapes[(1 + indsi[k]) * (nbatchdims + 3) + nbatchdims + 1 - tagIJ];
+    }
+
+    // Then, we do the same for shapes_j, but with "N" instead of "M":
+    for (int k = 0; k < sizej; k++) { // k-th line
+        for (int l = 0; l < nbatchdims; l++) { // l-th column
+            shapes_j[k * (nbatchdims + 1) + l] = shapes[(1 + indsj[k]) * (nbatchdims + 3) + l];
+        }
+        shapes_j[k * (nbatchdims + 1) + nbatchdims] = shapes[(1 + indsj[k]) * (nbatchdims + 3) + nbatchdims +
+                                                             tagIJ];
+    }
+
+    // And finally for the parameters, with "1" instead of "M":
+    for (int k = 0; k < sizep; k++) { // k-th line
+        for (int l = 0; l < nbatchdims; l++) { // l-th column
+            shapes_p[k * (nbatchdims + 1) + l] = shapes[(1 + indsp[k]) * (nbatchdims + 3) + l];
+        }
+        shapes_p[k * (nbatchdims + 1) + nbatchdims] = 1;
+    }
+
+}
\ No newline at end of file
diff --git a/keopscore/keopscore/include/utils_pe.h b/keopscore/keopscore/include/utils_pe.h
index 55c91babc..56ef5e99b 100644
--- a/keopscore/keopscore/include/utils_pe.h
+++ b/keopscore/keopscore/include/utils_pe.h
@@ -1,5 +1,6 @@
 #include <nvrtc.h>
 #include <numeric>
+#include <vector>
 
 #define NVRTC_SAFE_CALL(x) \
   do { \
@@ -68,7 +69,8 @@
 void load_args_FromHost(CUdeviceptr &p_data, TYPE *out, TYPE *&out_d, int nargs,
                         TYPE **arg, TYPE **&arg_d,
                         const std::vector< std::vector< int > > &argshape,
                         signed long int sizeout) {
-  signed long int sizes[nargs];
+
+  std::vector< signed long int > sizes(nargs);
   signed long int totsize = sizeout;
   for (int k = 0; k < nargs; k++) {
     sizes[k] = std::accumulate(argshape[k].begin(), argshape[k].end(), 1,
@@ -83,7 +85,7 @@
   TYPE *dataloc = (TYPE *)(arg_d + nargs);
 
   // host array of pointers to device data
-  TYPE *ph[nargs];
+  std::vector< TYPE * > ph(nargs);
 
   out_d = dataloc;
   dataloc += sizeout;
@@ -95,5 +97,5 @@
   }
 
   // copy array of pointers
-  CUDA_SAFE_CALL(cuMemcpyHtoD((CUdeviceptr)arg_d, ph, nargs * sizeof(TYPE *)));
+  CUDA_SAFE_CALL(cuMemcpyHtoD((CUdeviceptr)arg_d, ph.data(), nargs * sizeof(TYPE *)));
 }
diff --git a/keopscore/keopscore/include/utils_pe_win.h b/keopscore/keopscore/include/utils_pe_win.h
new file mode 100644
index 000000000..6e172e476
--- /dev/null
+++ b/keopscore/keopscore/include/utils_pe_win.h
@@ -0,0 +1,100 @@
+#include <nvrtc.h>
+#include <numeric>
+
+#define NVRTC_SAFE_CALL(x)                                        \
+    do {                                                          \
+        nvrtcResult result = x;                                   \
+        if (result != NVRTC_SUCCESS) {                            \
+            std::cerr << "\nerror: " #x " failed with error "     \
+                      << nvrtcGetErrorString(result) << '\n' << '\n'; \
+            throw std::runtime_error("[KeOps] NVRTC error.");     \
+        }                                                         \
+    } while(0)
+
+#define CUDA_SAFE_CALL_NO_EXCEPTION(x)                            \
+    do {                                                          \
+        CUresult result = x;                                      \
+        if (result != CUDA_SUCCESS) {                             \
+            const char *msg;                                      \
+            cuGetErrorName(result, &msg);                         \
+            std::cerr << "\n[KeOps] error: " #x " failed with error " \
+                      << msg << '\n' << '\n';                     \
+            exit(1);                                              \
+        }                                                         \
+    } while(0)
+
+#define CUDA_SAFE_CALL(x)                                         \
+    do {                                                          \
+        CUresult result = x;                                      \
+        if (result != CUDA_SUCCESS) {                             \
+            const char *msg;                                      \
+            cuGetErrorName(result, &msg);                         \
+            std::cerr << "\n[KeOps] error: " #x " failed with error " \
+                      << msg << '\n' << '\n';                     \
+            throw std::runtime_error("[KeOps] Cuda error.");      \
+        }                                                         \
+    } while(0)
+
+
+char *read_text_file(char const *path) {
+    char *buffer = 0;
+    long length;
+    FILE *f = fopen(path, "rb");
+    if (f) {
+        fseek(f, 0, SEEK_END);
+        length = ftell(f);
+        fseek(f, 0, SEEK_SET);
+        buffer = (char *) malloc((length + 1) * sizeof(char));
+        if (buffer) {
+            int res = fread(buffer, sizeof(char), length, f);
+            (void) res;
+            buffer[length] = '\0';
+        }
+        fclose(f);
+    }
+    return buffer;
+}
+
+
+
+template< typename TYPE >
+void load_args_FromDevice(CUdeviceptr &p_data, TYPE *out, TYPE *&out_d, int nargs, TYPE **arg, TYPE **&arg_d) {
+    CUDA_SAFE_CALL(cuMemAlloc(&p_data, sizeof(TYPE *) * nargs));
+    out_d = out;
+    arg_d = (TYPE **) p_data;
+    // copy array of pointers
+    CUDA_SAFE_CALL(cuMemcpyHtoD((CUdeviceptr) arg_d, arg, nargs * sizeof(TYPE *)));
+}
+
+
+template< typename TYPE >
+void
+load_args_FromHost(CUdeviceptr &p_data, TYPE *out, TYPE *&out_d, int nargs,
+                   TYPE **arg, TYPE **&arg_d,
+                   const std::vector< std::vector< int > > &argshape,
+                   int sizeout) {
+    std::vector< int > sizes(nargs);
+    int totsize = sizeout;
+    for (int k = 0; k < nargs; k++) {
+        sizes[k] = std::accumulate(argshape[k].begin(), argshape[k].end(), 1, std::multiplies< int >());
+        totsize += sizes[k];
+    }
+
+    CUDA_SAFE_CALL(cuMemAlloc(&p_data, sizeof(TYPE *) * nargs + sizeof(TYPE) * totsize));
+
+    arg_d = (TYPE **) p_data;
+    TYPE *dataloc = (TYPE *) (arg_d + nargs);
+
+    // host array of pointers to device data
+    std::vector< TYPE * > ph(nargs);
+
+    out_d = dataloc;
+    dataloc += sizeout;
+    for (int k = 0; k < nargs; k++) {
+        ph[k] = dataloc;
+        CUDA_SAFE_CALL(cuMemcpyHtoD((CUdeviceptr) dataloc, arg[k], sizeof(TYPE) * sizes[k]));
+        dataloc += sizes[k];
+    }
+
+    // copy array of pointers
+    CUDA_SAFE_CALL(cuMemcpyHtoD((CUdeviceptr) arg_d, ph.data(), nargs * sizeof(TYPE *)));
+}
diff --git a/keopscore/keopscore/windows_compilations/__init__.py b/keopscore/keopscore/windows_compilations/__init__.py
new file mode 100644
index 000000000..cc7a91321
--- /dev/null
+++ b/keopscore/keopscore/windows_compilations/__init__.py
@@ -0,0 +1,28 @@
+"""Initialize the windows_compilations package and create dictionaries with found includes/libs/dlls"""
+
+from .compile import compile
+from .compile_nvrtc_jit import compile_nvrtc_jit
+from .compile_pykeops_cpp_module import compile_pykeops_cpp_module
+from .compile_pykeops_nvrtc import compile_pykeops_nvrtc
+from .detection import (
+    dlls,
+    include_dirs,
+    keops_available,
+    lib_dirs,
+    lib_names,
+)
+from .globals import tmp_dir
+from .cuda_detection import cuda_available
+
+__all__ = [
+    "compile",
+    "tmp_dir",
+    "include_dirs",
+    "lib_dirs",
+    "lib_names",
+    "dlls",
+    "compile_pykeops_cpp_module",
+    "compile_nvrtc_jit",
+    "compile_pykeops_nvrtc",
+    "keops_available",
+]
diff --git a/keopscore/keopscore/windows_compilations/compile.py b/keopscore/keopscore/windows_compilations/compile.py
new file mode 100644
index 000000000..d01699e4a
--- /dev/null
+++ b/keopscore/keopscore/windows_compilations/compile.py
@@ -0,0 +1,134 @@
+from __future__ import annotations
+
+import os
+import shutil
+import sysconfig
+import uuid
+from pathlib import Path
+
+from .globals import tmp_dir
+
+_empty_list = []
+
+
+def compile(
+    source_file: os.PathLike,
+    project_name: str | None = None,
+    includes: list[os.PathLike] | None = _empty_list,
+    link_dirs: list[os.PathLike] | None = _empty_list,
+    links: list[str] | None = _empty_list,
+    macros: list[str] | None = _empty_list,
+    suffix: str = ".dll",
+    output_dir=".",
+    print_cmakelists=False,
+    show_cmake_commands_output=False,
+    clean_tmp_build_dir=True,
+):
+
+    output_dir = Path(output_dir)
+
+    if project_name is None:
+        project_name = "".join(Path(source_file).name.split(".")[:-1])
+
+    includes_str = ""
+    for include in includes:
+        includes_str += f'include_directories("{include!s}")\n'
+    includes_str = includes_str.replace("\\", "/")
+
+    link_dirs_str = ""
+    for link in link_dirs:
+        link_dirs_str += f'link_directories("{link!s}")\n'
+    link_dirs_str = link_dirs_str.replace("\\", "/")
+
+    macros_str = ""
+    for macro in macros:
+        macros_str += f"add_definitions({macro!s})\n"
+
+    if len(links) == 0:
+        links = ""
+    else:
+        inline_links = " ".join(links)
+        links = f"target_link_libraries(${{PROJECT_NAME}} {inline_links})"
+
+    with (Path(__file__).parent / "templates" / "CMakeLists.txt").open() as f:
+        content = f.read()
+
+    includes = includes_str
+    link_dirs = link_dirs_str
+    macros = macros_str
+    source_file = str(Path(source_file).resolve()).replace("\\", "/")
+
+    if "pyd" in suffix:
+        suffix = sysconfig.get_config_var("EXT_SUFFIX")
+
+    # substitute each ***field*** placeholder of the template with the
+    # matching local variable computed above
+    fields = [
+        "source_file",
+        "project_name",
+        "includes",
+        "link_dirs",
+        "links",
+        "macros",
+        "suffix",
+    ]
+
+    for field in fields:
+        content = content.replace(f"***{field}***", locals()[field])
+
+    cwd = Path.cwd()
+
+    # Build in a fresh, uniquely-named tmp dir
+    tmp_build = tmp_dir / str(uuid.uuid4())
+    tmp_build.mkdir(parents=True)
+
+    with (tmp_build / "CMakeLists.txt").open("w") as f:
+        f.write(content)
+
+    if print_cmakelists:
+        print(content)
+
+    os.chdir(tmp_build)
+    Path("build").mkdir()
+    os.chdir("build")
+
+    if not show_cmake_commands_output:
+        with (tmp_build / "log").open("w") as log_file:
+            subprocess.check_call(["cmake", ".."], stdout=log_file)
+            subprocess.check_call(
+                ["cmake", "--build", ".", "--config", "Release"], stdout=log_file
+            )
+    else:
+        subprocess.check_call(["cmake", ".."])
+        subprocess.check_call(["cmake", "--build", ".", "--config", "Release"])
+
+    # Back to previous working directory
+    os.chdir(cwd)
+
+    # Define the source and destination directories
+    Path(output_dir).mkdir(exist_ok=True)
+
+    source_dir = tmp_build / "build" / "Release"
+
+    # Copy the built artifacts with the requested suffix to the output directory
+    for item in os.listdir(source_dir):
+        s = source_dir / item
+        d = output_dir / item
+        if str(s).endswith(suffix):
+            shutil.copy2(s, d)
+
+    if clean_tmp_build_dir:
+        shutil.rmtree(tmp_build)
diff --git a/keopscore/keopscore/windows_compilations/compile_nvrtc_jit.py b/keopscore/keopscore/windows_compilations/compile_nvrtc_jit.py
new file mode 100644
index 000000000..7227bdd59
--- /dev/null
+++ b/keopscore/keopscore/windows_compilations/compile_nvrtc_jit.py
@@ -0,0 +1,38 @@
+from pathlib import Path
+
+from .compile import compile
+from .detection import (
+    include_dirs,
+    lib_dirs,
+    lib_names,
+)
+from .utils import find_package_location
+
+
+def compile_nvrtc_jit(build_folder):
+
+    keops_dir = Path(find_package_location("keopscore")).parent
+    source_file = keops_dir / "binders" / "nvrtc" / "nvrtc_jit_win.cpp"
+
+    macros = [
+        "-DMAXIDGPU=0",
+        "-DMAXTHREADSPERBLOCK0=1024",
+        "-DSHAREDMEMPERBLOCK0=49152",
+        "-DnvrtcGetTARGET=nvrtcGetCUBIN",
+        "-DnvrtcGetTARGETSize=nvrtcGetCUBINSize",
+        '-DARCHTAG="sm"',
+    ]
+
+    compile(
+        source_file=source_file,
+        project_name="nvrtc_jit",
+        macros=macros,
+        includes=[include_dirs[key] for key in ["keops", "cuda"]],
+        link_dirs=[lib_dirs[key] for key in ["cuda"]],
+        links=[lib_names[key] for key in ["cuda", "nvrtc", "cudart"]],
+        suffix=".dll",
+        output_dir=build_folder,
+        print_cmakelists=False,
+        show_cmake_commands_output=False,
+        clean_tmp_build_dir=False,
+    )
diff --git a/keopscore/keopscore/windows_compilations/compile_pykeops_cpp_module.py b/keopscore/keopscore/windows_compilations/compile_pykeops_cpp_module.py
new file mode 100644
index 000000000..47c8f56ba
--- /dev/null
+++ b/keopscore/keopscore/windows_compilations/compile_pykeops_cpp_module.py
@@ -0,0 +1,31 @@
+from .compile import compile
+from .detection import include_dirs, lib_dirs, lib_names
+
+
+def compile_pykeops_cpp_module(source_file, build_folder):
+    """Compile a pykeops cpp module with the built-in Windows compilation config.
+
+    Parameters
+    ----------
+    source_file
+        Location of the source cpp file
+    build_folder
+        The KeOps build folder
+
+    """
+
+    compile(
+        source_file=source_file,
+        includes=[
+            include_dirs["python"],
+            include_dirs["pybind11"],
+            include_dirs["keops"],
+        ],
+        link_dirs=[lib_dirs["python"]],
+        links=[lib_names["python"]],
+        suffix=".pyd",
+        output_dir=build_folder,
+        print_cmakelists=False,
+        show_cmake_commands_output=False,
+        clean_tmp_build_dir=False,
+    )
diff --git a/keopscore/keopscore/windows_compilations/compile_pykeops_nvrtc.py b/keopscore/keopscore/windows_compilations/compile_pykeops_nvrtc.py
new file mode 100644
index 000000000..361f00362
--- /dev/null
+++ b/keopscore/keopscore/windows_compilations/compile_pykeops_nvrtc.py
@@ -0,0 +1,38 @@
+from pathlib import Path
+
+from .compile import compile
+from .detection import (
+    include_dirs,
+    lib_dirs,
+    lib_names,
+)
+from .utils import find_package_location
+
+
+def compile_pykeops_nvrtc(build_folder):
+
+    pykeops_dir = Path(find_package_location("pykeops")).parent
+    source_file = pykeops_dir / "common" / "keops_io" / "pykeops_nvrtc_win.cpp"
+
+    macros = [
+        "-DMAXIDGPU=0",  # TODO should be detected
+        "-DMAXTHREADSPERBLOCK0=1024",  # TODO should be detected
+        "-DSHAREDMEMPERBLOCK0=49152",  # TODO should be detected
+        "-DnvrtcGetTARGET=nvrtcGetCUBIN",  # TODO should be detected
+        "-DnvrtcGetTARGETSize=nvrtcGetCUBINSize",  # TODO should be detected
+        '-DARCHTAG="sm"',  # TODO should be detected
+    ]
+
+    compile(
+        source_file=source_file,
+        project_name="pykeops_nvrtc",
+        macros=macros,
+        includes=[include_dirs[key] for key in ["python", "pybind11", "keops", "cuda"]],
+        link_dirs=[lib_dirs[key] for key in ["python", "cuda"]],
+        links=[lib_names[key] for key in ["cuda", "nvrtc", "cudart", "python"]],
+        suffix=".pyd",
+        output_dir=build_folder,
+        print_cmakelists=False,
+        show_cmake_commands_output=False,
+        clean_tmp_build_dir=False,
+    )
diff --git a/keopscore/keopscore/windows_compilations/cuda_detection.py b/keopscore/keopscore/windows_compilations/cuda_detection.py
new file mode 100644
index 000000000..3f4b8f63d
--- /dev/null
+++ b/keopscore/keopscore/windows_compilations/cuda_detection.py
@@ -0,0 +1,76 @@
+r"""
+CUDA toolkit detection on Windows.
+
+The CUDA_PATH environment variable must be set. It is usually set by the CUDA installer;
+if not, it must point to a valid CUDA installation
+(typically C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\vX.Y).
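+
+Example (illustrative path; the actual value depends on the local install):
+    CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.6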
+
+The detection looks for the following files:
+- cudart*.dll
+- nvrtc-builtins*.dll
+- nvcuda.dll (the CUDA driver library, usually located in the system32 folder)
+- the include directory
+- the lib/x64 directory containing cuda.lib, nvrtc.lib and cudart.lib
+"""
+
+import os
+from ctypes.util import find_library
+from pathlib import Path
+
+cuda_available = "CUDA_PATH" in os.environ
+
+
+def detect_cuda_toolkit():
+
+    output = {}
+
+    if cuda_available:
+
+        # base path of the CUDA installation (contains bin, lib, include, etc.)
+        cuda_path = Path(os.environ["CUDA_PATH"])
+
+        if find_library("nvcuda") is not None:  # NVCUDA is the main CUDA driver library
+            output["dll_cuda"] = find_library("nvcuda")
+
+        #################################################
+        # Detect relevant DLLs: cudart and nvrtc-builtins
+        #################################################
+
+        # Check both bin and bin/x64 directories for relevant DLLs
+        bin_dirs = [Path(cuda_path, "bin"), Path(cuda_path, "bin", "x64")]
+
+        for bin_dir in bin_dirs:
+            if bin_dir.is_dir():
+                for file in bin_dir.iterdir():
+                    if file.name.startswith("cudart") and file.name.endswith(".dll"):
+                        output["dll_cudart"] = str(file)
+                    if file.name.startswith("nvrtc-builtins") and file.name.endswith(
+                        ".dll"
+                    ):
+                        output["dll_nvrtc"] = str(file)
+
+        #################################################
+        # Detect include and lib directories
+        #################################################
+        cuda_include = Path(cuda_path, "include")
+        if cuda_include.is_dir():
+            output["include_dir"] = str(cuda_include)
+
+        cuda_libs = Path(cuda_path, "lib", "x64")
+        if cuda_libs.is_dir():
+            output["lib_dirs"] = str(cuda_libs)
+
+        #################################################
+        # Make sure that the cuda, nvrtc and cudart libs are available
+        #################################################
+        output["lib_names"] = {}
+        for key in ["cuda", "nvrtc", "cudart"]:
+            if (cuda_libs / (key + ".lib")).is_file():
+                output["lib_names"][key] = key
+
+    return output
diff --git a/keopscore/keopscore/windows_compilations/cuda_detection/CMakeLists.txt b/keopscore/keopscore/windows_compilations/cuda_detection/CMakeLists.txt
new file mode 100644
index 000000000..30cca9752
--- /dev/null
+++ b/keopscore/keopscore/windows_compilations/cuda_detection/CMakeLists.txt
@@ -0,0 +1,127 @@
+cmake_minimum_required(VERSION 3.21)
+project(cuda_detection LANGUAGES CXX)
+
+# If you have multiple toolkits, pass -DCUDAToolkit_ROOT="C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.6"
+find_package(CUDAToolkit REQUIRED)
+
+# --- Derive include and lib dirs ------------------------------------------------
+# include_dir: usually one path
+list(GET CUDAToolkit_INCLUDE_DIRS 0 CUDA_DETECT_INCLUDE_DIR)
+
+# lib_dirs: prefer .../lib/x64 if present, else CUDAToolkit_LIBRARY_DIR
+set(CUDA_DETECT_LIB_DIR "${CUDAToolkit_LIBRARY_DIR}")
+if(EXISTS "${CUDAToolkit_LIBRARY_DIR}/x64")
+    set(CUDA_DETECT_LIB_DIR "${CUDAToolkit_LIBRARY_DIR}/x64")
+endif()
+
+# bin dir for DLLs
+set(_BIN "${CUDAToolkit_BIN_DIR}")
+
+# --- Version tag for NVRTC-builtins (12.6 -> 126) ------------------------------
+string(REPLACE "." ";" _ver_list "${CUDAToolkit_VERSION}")
+list(GET _ver_list 0 _maj)
+list(LENGTH _ver_list _len)
+if(_len GREATER 1)
+    list(GET _ver_list 1 _min)
+else()
+    set(_min 0)
+endif()
+math(EXPR _TAG "${_maj} * 10 + ${_min}")
+
+# --- dll_cuda (driver DLL) -----------------------------------------------------
+# Prefer System32 (64-bit); the SysWOW64 fallback only matters on 32-bit setups
+find_file(CUDA_DETECT_DLL_CUDA
+          NAMES nvcuda.dll
+          PATHS "C:/Windows/System32" "C:/Windows/SysWOW64"
+          NO_DEFAULT_PATH)
+
+# --- dll_cudart (runtime DLL) --------------------------------------------------
+# Try versioned names first, then any cudart64*.dll in the toolkit bin
+set(CUDA_DETECT_DLL_CUDART "")
+foreach(_cand
+        "cudart64_${_maj}${_min}.dll"
+        "cudart64.dll")
+    find_file(CUDA_DETECT_DLL_CUDART NAMES "${_cand}" HINTS "${_BIN}" NO_DEFAULT_PATH)
+    if(CUDA_DETECT_DLL_CUDART)
+        break()
+    endif()
+endforeach()
+if(NOT CUDA_DETECT_DLL_CUDART)
+    file(GLOB _cudart_glob LIST_DIRECTORIES OFF "${_BIN}/cudart64*.dll")
+    if(_cudart_glob)
+        list(GET _cudart_glob 0 CUDA_DETECT_DLL_CUDART)
+    endif()
+endif()
+
+# --- dll_nvrtc (we specifically want the *builtins* DLL) ------------------------
+# Search for nvrtc-builtins matching our tag, else first available
+set(CUDA_DETECT_DLL_NVRTC "")
+file(GLOB _builtins_glob LIST_DIRECTORIES OFF "${_BIN}/nvrtc-builtins*.dll")
+if(_builtins_glob)
+    foreach(_dll IN LISTS _builtins_glob)
+        get_filename_component(_name "${_dll}" NAME)
+        if(_name MATCHES "nvrtc-builtins.*${_TAG}")
+            set(CUDA_DETECT_DLL_NVRTC "${_dll}")
+            break()
+        endif()
+    endforeach()
+    if(NOT CUDA_DETECT_DLL_NVRTC)
+        list(GET _builtins_glob 0 CUDA_DETECT_DLL_NVRTC)
+    endif()
+endif()
+
+# --- Sanity hints (optional warnings) -----------------------------------------
+if(NOT CUDA_DETECT_DLL_CUDA)
+    message(WARNING "nvcuda.dll not found in System32/SysWOW64.")
+endif()
+if(NOT CUDA_DETECT_DLL_CUDART)
+    message(WARNING "cudart64*.dll not found in ${_BIN}.")
+endif()
+if(NOT CUDA_DETECT_DLL_NVRTC)
+    message(WARNING "nvrtc-builtins*.dll not found in ${_BIN}.")
+endif()
+
+# --- Produce outputs in the schema expected by detection.py --------------------
+# lib_names are the import-library basenames
+set(_LIB_NAMES_PY "{'cuda': 'cuda', 'cudart': 'cudart', 'nvrtc': 'nvrtc'}")
+
+# Escape backslashes for Python/JSON strings
+function(_escape_for_py in_var out_var)
+    string(REPLACE "\\" "\\\\" _tmp "${${in_var}}")
+    set(${out_var} "${_tmp}" PARENT_SCOPE)
+endfunction()
+
+_escape_for_py(CUDA_DETECT_DLL_CUDA _DLL_CUDA_PY)
+_escape_for_py(CUDA_DETECT_DLL_CUDART _DLL_CUDART_PY)
+_escape_for_py(CUDA_DETECT_DLL_NVRTC _DLL_NVRTC_PY)
+_escape_for_py(CUDA_DETECT_INCLUDE_DIR _INCLUDE_PY)
+_escape_for_py(CUDA_DETECT_LIB_DIR _LIBDIR_PY)
+
+# Python-style dict (exact key names expected by the Python side).
+# N.B.: string(CONCAT ...) builds a single string; a plain set() would create a
+# semicolon-separated list instead.
+string(CONCAT _DICT_PY
+    "{'dll_cuda': '${_DLL_CUDA_PY}', "
+    "'dll_cudart': '${_DLL_CUDART_PY}', "
+    "'dll_nvrtc': '${_DLL_NVRTC_PY}', "
+    "'include_dir': '${_INCLUDE_PY}', "
+    "'lib_dirs': '${_LIBDIR_PY}', "
+    "'lib_names': ${_LIB_NAMES_PY}}"
+)
+
+message(STATUS "CUDA_DETECTION=${_DICT_PY}")
+
+# Also emit a JSON file (cuda_detect.json) in the build dir for programmatic use.
+# Note: same values as above; JSON requires quoted keys and escaped paths.
+set(_JSON + "{\n" + " \"dll_cuda\": \"${_DLL_CUDA_PY}\",\n" + " \"dll_cudart\": \"${_DLL_CUDART_PY}\",\n" + " \"dll_nvrtc\": \"${_DLL_NVRTC_PY}\",\n" + " \"include_dir\": \"${_INCLUDE_PY}\",\n" + " \"lib_dirs\": \"${_LIBDIR_PY}\",\n" + " \"lib_names\": {\"cuda\": \"cuda\", \"cudart\": \"cudart\", \"nvrtc\": \"nvrtc\"}\n" + "}\n") +file(WRITE "${CMAKE_BINARY_DIR}/cuda_detect.json" "${_JSON}") + +# Dummy target so configure+build does something +add_custom_target(show_detect ALL + COMMENT "CUDA detection complete. See message above or cuda_detect.json.") diff --git a/keopscore/keopscore/windows_compilations/detection.py b/keopscore/keopscore/windows_compilations/detection.py new file mode 100644 index 000000000..059d3c9fe --- /dev/null +++ b/keopscore/keopscore/windows_compilations/detection.py @@ -0,0 +1,62 @@ +import sys +import sysconfig +from pathlib import Path + +import pybind11 + +from .cuda_detection import detect_cuda_toolkit, cuda_available +from .utils import find_package_location + +include_dirs = {} +lib_dirs = {} +lib_names = {} +dlls = {} + + +try: + location_keops_init = find_package_location("keopscore") + include_dirs["keops"] = str(Path(location_keops_init).parent) + keops_available = True +except ImportError: + keops_available = False + + +include_dirs["pybind11"] = pybind11.get_include() + + +# Python +include_dirs["python"] = sysconfig.get_path("include") + +if sys.platform == "win32": + # On Windows, get the path to the Python DLL + python_libs = Path(sysconfig.get_path("include")).parent / "libs" + + # Get the path to the standard library (Lib) + if python_libs.is_dir(): + lib_dirs["python"] = str(python_libs) + + version = str(sys.version_info.major) + str(sys.version_info.minor) + if (Path(lib_dirs["python"]) / ("python" + version + ".lib")).is_file(): + lib_names["python"] = "python" + version +else: + # On Unix-like systems, use sysconfig to get the library directory and name + lib_dirs["python"] = Path(sysconfig.get_config_var("LIBDIR")) + lib_names["python"] = sysconfig.get_config_var("LDLIBRARY") + + +# Cuda +if cuda_available: + cuda_config = detect_cuda_toolkit() + for key in ["cuda", "nvrtc", "cudart"]: + + if f"dll_{key}" in cuda_config: + dlls[key] = cuda_config[f"dll_{key}"] + + if key in cuda_config["lib_names"]: + lib_names[key] = cuda_config["lib_names"][key] + + if "include_dir" in cuda_config: + include_dirs["cuda"] = cuda_config["include_dir"] + + if "lib_dirs" in cuda_config: + lib_dirs["cuda"] = cuda_config["lib_dirs"] diff --git a/keopscore/keopscore/windows_compilations/globals.py b/keopscore/keopscore/windows_compilations/globals.py new file mode 100644 index 000000000..9eb9504c8 --- /dev/null +++ b/keopscore/keopscore/windows_compilations/globals.py @@ -0,0 +1,5 @@ +from pathlib import Path + +tmp_dir = Path.home() / "tmp" / "cppcompile_windows" +(Path.home() / "tmp").mkdir(exist_ok=True) +tmp_dir.mkdir(exist_ok=True) diff --git a/keopscore/keopscore/windows_compilations/templates/CMakeLists.txt b/keopscore/keopscore/windows_compilations/templates/CMakeLists.txt new file mode 100644 index 000000000..458a6b5af --- /dev/null +++ b/keopscore/keopscore/windows_compilations/templates/CMakeLists.txt @@ -0,0 +1,39 @@ +cmake_minimum_required(VERSION 3.10) +project(***project_name***) + +# Set C++ standard +set(CMAKE_CXX_STANDARD 11) + +# Set compiler options +if(MSVC) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /LD /Ox /GL /openmp") +else() + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -O3 -fopenmp") +endif() + +# Preprocessor macros +***macros*** + 
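+# N.B.: every ***field*** marker in this template is replaced verbatim by
+# windows_compilations/compile.py before CMake is invoked.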
+
+# Include directories
+***includes***
+
+# Link directories
+***link_dirs***
+
+# Source file
+set(SOURCE_FILE "***source_file***")
+
+# Add shared library
+add_library(${PROJECT_NAME} SHARED ${SOURCE_FILE})
+
+# Link
+***links***
+
+# Set the output path for the shared library and ensure the right extension
+set_target_properties(${PROJECT_NAME} PROPERTIES
+    LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
+    OUTPUT_NAME "***project_name***"
+    SUFFIX "***suffix***" # Ensure the correct file extension for Python modules
+)
+
+# Silence MSVC warnings (e.g. warning 8029, triggered because we build in a tmp dir).
+# N.B.: "MSVC_WARNING_LEVEL"/"MSVC_WARNING_DISABLE" are not built-in CMake target
+# properties, so we pass the compiler flag directly instead.
+if(MSVC)
+    target_compile_options(${PROJECT_NAME} PRIVATE /W0)
+endif()
diff --git a/keopscore/keopscore/windows_compilations/utils.py b/keopscore/keopscore/windows_compilations/utils.py
new file mode 100644
index 000000000..6a005d08e
--- /dev/null
+++ b/keopscore/keopscore/windows_compilations/utils.py
@@ -0,0 +1,31 @@
+import importlib.util
+
+
+def find_package_location(package_name: str) -> str:
+    """Find the __init__ file of a given package.
+
+    This function does not import the package; it was written to avoid circular
+    imports with KeOps.
+
+    Parameters
+    ----------
+    package_name
+        The name of the package
+
+    Returns
+    -------
+    str
+        The path to the package's __init__ file
+
+    Raises
+    ------
+    ImportError
+        If the package cannot be found
+
+    """
+    spec = importlib.util.find_spec(package_name)
+    if spec is not None and spec.origin:
+        return spec.origin
+    else:
+        message = f"Package '{package_name}' not found."
+        raise ImportError(message)
diff --git a/keopscore/setup.py b/keopscore/setup.py
index a0854b4a2..77c9eaaa8 100644
--- a/keopscore/setup.py
+++ b/keopscore/setup.py
@@ -11,6 +11,16 @@
 with open(os.path.join(here, "keopscore", "keops_version"), encoding="utf-8") as v:
     current_version = v.read().rstrip()
 
+# TODO: fix this (symlink issues on Windows? -> move to pyproject.toml?)
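+# (The keops_version file is a symlink in the repo; on Windows it may not be
+# checked out as a real file, so we read the top-level copy and write its
+# content into the package explicitly.)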
+if os.name == "nt": + with open(os.path.join(here, "..", "keops_version"), encoding="utf-8") as v: + current_version = v.read().rstrip() + # copy the content to keopscore/keops_version + with open( + os.path.join(here, "keopscore", "keops_version"), "w", encoding="utf-8" + ) as v: + v.write(current_version) + # Get the long description from the README file with open(path.join(here, "keopscore", "readme.md"), encoding="utf-8") as f: long_description = f.read() @@ -59,6 +69,7 @@ "keopscore.mapreduce.cpu", "keopscore.mapreduce.gpu", "keopscore.utils", + "keopscore.windows_compilations", ], package_data={ "keopscore": [ @@ -73,8 +84,16 @@ "include/Ranges.h", "include/Sizes.h", "include/utils_pe.h", + "binders/nvrtc/keops_nvrtc_win.cpp", + "binders/nvrtc/nvrtc_jit_win.cpp", + "include/CudaSizes_win.h", + "include/ranges_utils_win.h", + "include/Ranges_win.h", + "include/Sizes_win.h", + "include/utils_pe_win.h", + "windows_compilations/templates/CMakeLists.txt", ], }, - install_requires=[], + install_requires=["cmake"], extras_require={}, ) diff --git a/pykeops/pykeops/common/keops_io/LoadKeOps_cpp.py b/pykeops/pykeops/common/keops_io/LoadKeOps_cpp.py index 7b632f3c9..961568023 100644 --- a/pykeops/pykeops/common/keops_io/LoadKeOps_cpp.py +++ b/pykeops/pykeops/common/keops_io/LoadKeOps_cpp.py @@ -11,6 +11,8 @@ from keopscore.utils.misc_utils import KeOps_OS_Run from pykeops.config import pykeops_cpp_name, python_includes +# TODO limit code duplication for get_pybind11_code() + class LoadKeOps_cpp_class(LoadKeOps): def __init__(self, *args, fast_init=False): @@ -33,7 +35,16 @@ def init_phase1(self): flush=True, end="", ) - KeOps_OS_Run(compile_command) + + if os.name == "nt": + from keopscore.windows_compilations import compile_pykeops_cpp_module + + compile_pykeops_cpp_module( + source_file=srcname, + build_folder=get_build_folder(), + ) + else: + KeOps_OS_Run(compile_command) pyKeOps_Message("OK", use_tag=False, flush=True) def init_phase2(self): @@ -70,7 +81,8 @@ def call_keops(self, nx, ny): ) def get_pybind11_code(self): - return f""" + if os.name != "nt": + return f""" #include "{self.params.source_name}" #include @@ -152,7 +164,7 @@ def get_pybind11_code(self): }} - return launch_keops_cpu_{self.params.tag}< TYPE >(dimY, + return launch_keops_cpu_{self.params.tag} < TYPE >(dimY, nx, ny, tagI, @@ -181,6 +193,120 @@ def get_pybind11_code(self): }} """ + else: + return f""" +#include "{self.params.source_name}" + +#include +namespace py = pybind11; + +template < typename TYPE > +int launch_pykeops_{self.params.tag}_cpu(int dimY, int nx, int ny, + int tagI, int tagZero, int use_half, + int dimred, + int use_chunk_mode, + py::tuple py_indsi, py::tuple py_indsj, py::tuple py_indsp, + int dimout, + py::tuple py_dimsx, py::tuple py_dimsy, py::tuple py_dimsp, + py::tuple py_ranges, + py::tuple py_shapeout, + long out_void, + py::tuple py_arg, + py::tuple py_argshape){{ + + /*------------------------------------*/ + /* Cast input args */ + /*------------------------------------*/ + + std::vector< int > indsi_v(py_indsi.size()); + for (auto i = 0; i < py_indsi.size(); i++) + indsi_v[i] = py::cast< int >(py_indsi[i]); + + + std::vector< int > indsj_v(py_indsj.size()); + for (auto i = 0; i < py_indsj.size(); i++) + indsj_v[i] = py::cast< int >(py_indsj[i]); + + + std::vector< int > indsp_v(py_indsp.size()); + for (auto i = 0; i < py_indsp.size(); i++) + indsp_v[i] = py::cast< int >(py_indsp[i]); + + + std::vector< int > dimsx_v(py_dimsx.size()); + for (auto i = 0; i < py_dimsx.size(); i++) + 
dimsx_v[i] = py::cast< int >(py_dimsx[i]); + + + std::vector< int > dimsy_v(py_dimsy.size()); + for (auto i = 0; i < py_dimsy.size(); i++) + dimsy_v[i] = py::cast< int >(py_dimsy[i]); + + + std::vector< int > dimsp_v(py_dimsp.size()); + for (auto i = 0; i < py_dimsp.size(); i++) + dimsp_v[i] = py::cast< int >(py_dimsp[i]); + + + // Cast the ranges arrays + std::vector< int* > ranges_v(py_ranges.size()); + for (int i = 0; i < py_ranges.size(); i++) + ranges_v[i] = (int*) py::cast< long >(py_ranges[i]); + int **ranges = (int**) ranges_v.data(); + + std::vector< int > shapeout_v(py_shapeout.size()); + for (auto i = 0; i < py_shapeout.size(); i++) + shapeout_v[i] = py::cast< int >(py_shapeout[i]); + + TYPE *out = (TYPE*) out_void; + // std::cout << "out_ptr : " << (long) out << std::endl; + + std::vector< TYPE* > arg_v(py_arg.size()); + for (int i = 0; i < py_arg.size(); i++) + arg_v[i] = (TYPE*) py::cast< long >(py_arg[i]); + TYPE **arg = (TYPE**) arg_v.data(); + + std::vector< std::vector< int > > argshape_v(py_argshape.size()); + for (auto i = 0; i < py_argshape.size(); i++){{ + py::tuple tmp = py_argshape[i]; + std::vector< int > tmp_v(tmp.size()); + for (auto j =0; j < tmp.size(); j++) + tmp_v[j] = py::cast< int >(tmp[j]); + argshape_v[i] = tmp_v; + }} + + + return launch_keops_cpu_{self.params.tag}< TYPE >(dimY, + nx, + ny, + tagI, + tagZero, + use_half, + dimred, + use_chunk_mode, + indsi_v, + indsj_v, + indsp_v, + dimout, + dimsx_v, + dimsy_v, + dimsp_v, + ranges, + shapeout_v, + out, + arg, + argshape_v); + +}} + +PYBIND11_MODULE(pykeops_cpp_{self.params.tag}, m) {{ + m.doc() = "pyKeOps: KeOps for pytorch through pybind11 (pytorch flavour)."; + m.def("launch_pykeops_cpu", &launch_pykeops_{self.params.tag}_cpu < {cpp_dtype[self.params.dtype]} >, "Entry point to keops."); +}} + """.replace( + "long", "int64_t" + ) + LoadKeOps_cpp = Cache_partial( LoadKeOps_cpp_class, use_cache_file=True, save_folder=get_build_folder() diff --git a/pykeops/pykeops/common/keops_io/LoadKeOps_nvrtc.py b/pykeops/pykeops/common/keops_io/LoadKeOps_nvrtc.py index 95ef25489..651d8d9b7 100644 --- a/pykeops/pykeops/common/keops_io/LoadKeOps_nvrtc.py +++ b/pykeops/pykeops/common/keops_io/LoadKeOps_nvrtc.py @@ -66,7 +66,7 @@ def call_keops(self, nx, ny): self.params.dimsp, self.ranges_ptr_new, self.outshape, - self.out_ptr, + self.out_ptr if os.name != "nt" else (self.out_ptr,), self.args_ptr_new, self.argshapes_new, ) @@ -85,7 +85,13 @@ def compile_jit_binary(): dllname=pykeops.config.pykeops_nvrtc_name(type="target"), ) pyKeOps_Message("Compiling nvrtc binder for python ... 
", flush=True, end="") - KeOps_OS_Run(compile_command) + if os.name == "nt": + from keopscore.windows_compilations import compile_pykeops_nvrtc + + compile_pykeops_nvrtc(build_folder=get_build_folder()) + + else: + KeOps_OS_Run(compile_command) pyKeOps_Message("OK", use_tag=False, flush=True) diff --git a/pykeops/pykeops/common/keops_io/pykeops_nvrtc_win.cpp b/pykeops/pykeops/common/keops_io/pykeops_nvrtc_win.cpp new file mode 100644 index 000000000..20d8c3d3b --- /dev/null +++ b/pykeops/pykeops/common/keops_io/pykeops_nvrtc_win.cpp @@ -0,0 +1,148 @@ +// g++ -std=c++11 -shared -fPIC -O3 -fpermissive -lcuda -lnvrtc -L/usr/lib -L/opt/cuda/lib64 -L/opt/cuda/targets/x86_64-linux/lib/ -I/opt/cuda/targets/x86_64-linux/include/ -I /home/bcharlier/projets/keops/keops/keopscore/ -I/usr/include/python3.10/ -DnvrtcGetTARGET=nvrtcGetCUBIN -DnvrtcGetTARGETSize=nvrtcGetCUBINSize -DARCHTAG=\"sm\" keops_io.cpp -o keops_io.cpython-310-x86_64-linux-gnu.so + + +#include +#include + +namespace py = pybind11; + +template< typename TYPE > +class KeOps_module_python : public KeOps_module< TYPE > { +public: + + using KeOps_module< TYPE >::KeOps_module; + + + int operator()(int tagHostDevice, int dimY, int nx, int ny, + int tagI, int tagZero, int use_half, + int tag1D2D, int dimred, + int cuda_block_size, int use_chunk_mode, + py::tuple py_indsi, py::tuple py_indsj, py::tuple py_indsp, + int dimout, + py::tuple py_dimsx, py::tuple py_dimsy, py::tuple py_dimsp, + py::tuple py_ranges, + py::tuple py_shapeout, + py::tuple out_void, + py::tuple py_arg, + py::tuple py_argshape + ) { + + /*------------------------------------*/ + /* Cast input args */ + /*------------------------------------*/ + + std::vector< int > indsi_v(py_indsi.size()); + for (auto i = 0; i < py_indsi.size(); i++) + indsi_v[i] = py::cast< int >(py_indsi[i]); + + + std::vector< int > indsj_v(py_indsj.size()); + for (auto i = 0; i < py_indsj.size(); i++) + indsj_v[i] = py::cast< int >(py_indsj[i]); + + + std::vector< int > indsp_v(py_indsp.size()); + for (auto i = 0; i < py_indsp.size(); i++) + indsp_v[i] = py::cast< int >(py_indsp[i]); + + + std::vector< int > dimsx_v(py_dimsx.size()); + for (auto i = 0; i < py_dimsx.size(); i++) + dimsx_v[i] = py::cast< int >(py_dimsx[i]); + + + std::vector< int > dimsy_v(py_dimsy.size()); + for (auto i = 0; i < py_dimsy.size(); i++) + dimsy_v[i] = py::cast< int >(py_dimsy[i]); + + + std::vector< int > dimsp_v(py_dimsp.size()); + for (auto i = 0; i < py_dimsp.size(); i++) + dimsp_v[i] = py::cast< int >(py_dimsp[i]); + + + // Cast the ranges arrays + std::vector< int * > ranges_v(py_ranges.size()); + for (int i = 0; i < py_ranges.size(); i++) + ranges_v[i] = (int *) py::cast< int64_t >(py_ranges[i]); + int **ranges = (int **) ranges_v.data(); + + // for (auto i: ranges_v) + // std::cout << " " << (long) i << " "; + // std::cout << std::endl; + + //for (auto i=0; i<7; i++) + // std::cout << " " << (long) ranges[i] << " "; + //std::cout << std::endl; + + std::vector< int > shapeout_v(py_shapeout.size()); + for (auto i = 0; i < py_shapeout.size(); i++) + shapeout_v[i] = py::cast< int >(py_shapeout[i]); + + TYPE *out = (TYPE *)py::cast< int64_t >(out_void[0]); + // std::cout << "out_ptr : " << (long) out << std::endl; + + std::vector < TYPE * > arg_v(py_arg.size()); + for (int i = 0; i < py_arg.size(); i++) + arg_v[i] = (TYPE *) py::cast< int64_t >(py_arg[i]); + TYPE **arg = (TYPE **) arg_v.data(); + + std::vector > argshape_v(py_argshape.size()); + for (auto i = 0; i < py_argshape.size(); i++) { + py::tuple tmp = 
py_argshape[i]; + std::vector< int > tmp_v(tmp.size()); + for (auto j = 0; j < tmp.size(); j++) + tmp_v[j] = py::cast< int >(tmp[j]); + argshape_v[i] = tmp_v; + } + +// for (auto i : argshape_v) +// for (auto j : i) +// std::cout << j << " " ; + + return KeOps_module< TYPE >::launch_kernel(tagHostDevice, + dimY, + nx, + ny, + tagI, + tagZero, + use_half, + tag1D2D, + dimred, + cuda_block_size, + use_chunk_mode, + indsi_v, + indsj_v, + indsp_v, + dimout, + dimsx_v, + dimsy_v, + dimsp_v, + ranges, + shapeout_v, + out, + arg, + argshape_v); + } + +}; +///////////////////////////////////////////////////////////////////////////////// +// PyBind11 entry point // +///////////////////////////////////////////////////////////////////////////////// + + +PYBIND11_MODULE(pykeops_nvrtc, m) { +m.doc() = "pyKeOps: KeOps for pytorch through pybind11 (pytorch flavour)."; + +py::class_< KeOps_module_python< float > >(m, "KeOps_module_float") +.def(py::init()) +.def("__call__", &KeOps_module_python< float >::operator()); + +py::class_< KeOps_module_python< double > >(m, "KeOps_module_double") +.def(py::init()) +.def("__call__", &KeOps_module_python< double >::operator()); + +py::class_< KeOps_module_python< half2 > >(m, "KeOps_module_half2") +.def(py::init()) +.def("__call__", &KeOps_module_python< half2 >::operator()); +} diff --git a/pykeops/pykeops/common/utils.py b/pykeops/pykeops/common/utils.py index 104485ee9..d4d37fe7d 100644 --- a/pykeops/pykeops/common/utils.py +++ b/pykeops/pykeops/common/utils.py @@ -1,8 +1,3 @@ -import fcntl -import functools -import importlib.util -import os - import pykeops.config c_type = dict(float16="half2", float32="float", float64="double") diff --git a/pykeops/setup.py b/pykeops/setup.py index d8154d91f..9dc2c2fd1 100644 --- a/pykeops/setup.py +++ b/pykeops/setup.py @@ -12,6 +12,16 @@ with open(os.path.join(here, "pykeops", "keops_version"), encoding="utf-8") as v: current_version = v.read().rstrip() +# TODO fix this (issues with symlinks on windows ? -> moving to pyproject.toml ?) +if os.name == "nt": + with open(os.path.join(here, "..", "keops_version"), encoding="utf-8") as v: + current_version = v.read().rstrip() + # copy the content to pykeops/keops_version + with open( + os.path.join(here, "pykeops", "keops_version"), "w", encoding="utf-8" + ) as v: + v.write(current_version) + # Get the long description from the README file with open(path.join(here, "pykeops", "readme.md"), encoding="utf-8") as f: long_description = f.read() @@ -62,6 +72,7 @@ "licence.txt", "keops_version", "common/keops_io/pykeops_nvrtc.cpp", + "common/keops_io/pykeops_nvrtc_win.cpp", ], }, install_requires=["numpy", "pybind11", "keopscore"],
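
Taken together, the Windows code path replaces every KeOps_OS_Run(compile_command) call with a CMake-driven build. A minimal sketch of how the new helper is meant to be invoked (names are taken from the patch above; the source file and output directory are illustrative):

    import os

    if os.name == "nt":
        from keopscore.windows_compilations import compile as cmake_compile
        from keopscore.windows_compilations import include_dirs

        # renders templates/CMakeLists.txt, builds in a fresh tmp dir under
        # ~/tmp/cppcompile_windows, then copies the resulting .dll to output_dir
        cmake_compile(
            source_file="my_kernel.cpp",  # hypothetical source file
            project_name="my_kernel",
            includes=[include_dirs["keops"]],
            suffix=".dll",
            output_dir="./build",  # illustrative output dir
        )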