From 2460b3aa5dc0e90ad01f5159ebc581f5bb1cf9b0 Mon Sep 17 00:00:00 2001 From: Louis Pujol Date: Mon, 19 May 2025 15:26:11 +0200 Subject: [PATCH 01/25] remove unused imports --- pykeops/pykeops/common/utils.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/pykeops/pykeops/common/utils.py b/pykeops/pykeops/common/utils.py index 104485ee9..d4d37fe7d 100644 --- a/pykeops/pykeops/common/utils.py +++ b/pykeops/pykeops/common/utils.py @@ -1,8 +1,3 @@ -import fcntl -import functools -import importlib.util -import os - import pykeops.config c_type = dict(float16="half2", float32="float", float64="double") From 143537cb95876a611dcd4f5a2d2d883ef4b82355 Mon Sep 17 00:00:00 2001 From: Louis Pujol <52413616+Louis-Pujol@users.noreply.github.com> Date: Mon, 19 May 2025 16:06:46 +0200 Subject: [PATCH 02/25] add windows_compilations subpackage to keopscore --- .../windows_compilations/__init__.py | 27 ++++ .../keopscore/windows_compilations/compile.py | 126 ++++++++++++++++++ .../windows_compilations/compile_nvrtc_jit.py | 44 ++++++ .../compile_pykeops_cpp_module.py | 34 +++++ .../compile_pykeops_nvrtc.py | 44 ++++++ .../windows_compilations/cuda_detection.py | 42 ++++++ .../windows_compilations/detection.py | 64 +++++++++ .../keopscore/windows_compilations/globals.py | 5 + .../templates/CMakeLists.txt | 39 ++++++ .../keopscore/windows_compilations/utils.py | 31 +++++ 10 files changed, 456 insertions(+) create mode 100644 keopscore/keopscore/windows_compilations/__init__.py create mode 100644 keopscore/keopscore/windows_compilations/compile.py create mode 100644 keopscore/keopscore/windows_compilations/compile_nvrtc_jit.py create mode 100644 keopscore/keopscore/windows_compilations/compile_pykeops_cpp_module.py create mode 100644 keopscore/keopscore/windows_compilations/compile_pykeops_nvrtc.py create mode 100644 keopscore/keopscore/windows_compilations/cuda_detection.py create mode 100644 keopscore/keopscore/windows_compilations/detection.py create mode 100644 
keopscore/keopscore/windows_compilations/globals.py create mode 100644 keopscore/keopscore/windows_compilations/templates/CMakeLists.txt create mode 100644 keopscore/keopscore/windows_compilations/utils.py diff --git a/keopscore/keopscore/windows_compilations/__init__.py b/keopscore/keopscore/windows_compilations/__init__.py new file mode 100644 index 000000000..985c7f110 --- /dev/null +++ b/keopscore/keopscore/windows_compilations/__init__.py @@ -0,0 +1,27 @@ +"""Initialize the windows_compilations package and create dictionaries with found includes/libs/dlls""" + +from .compile import compile +from .compile_nvrtc_jit import compile_nvrtc_jit +from .compile_pykeops_cpp_module import compile_pykeops_cpp_module +from .compile_pykeops_nvrtc import compile_pykeops_nvrtc +from .detection import ( + dlls, + include_dirs, + keops_available, + lib_dirs, + lib_names, +) +from .globals import tmp_dir + +__all__ = [ + "compile", + "tmp_dir", + "include_dirs", + "lib_dirs", + "lib_names", + "dlls", + "compile_pykeops_cpp_module", + "compile_nvrtc_jit", + "compile_pykeops_nvrtc", + "keops_available", + ] diff --git a/keopscore/keopscore/windows_compilations/compile.py b/keopscore/keopscore/windows_compilations/compile.py new file mode 100644 index 000000000..d07058500 --- /dev/null +++ b/keopscore/keopscore/windows_compilations/compile.py @@ -0,0 +1,126 @@ +import os +import shutil +import sysconfig +import uuid +from pathlib import Path + +from .globals import tmp_dir + +path_type = str | os.PathLike | bytes +_empty_list = [] + +def compile( + source_file: path_type, + project_name: str | None = None, + includes: list[path_type] | None = _empty_list, + link_dirs: list[path_type] | None = _empty_list, + links: list[str] | None = _empty_list, + macros: list[str] | None = _empty_list, + suffix: str = ".dll", + output_dir=".", + print_cmakelists=False, + show_cmake_commands_output=False, + clean_tmp_build_dir=True, + ): + + output_dir = Path(output_dir) + + if project_name is 
None: + project_name = "".join(Path(source_file).name.split(".")[:-1]) + + includes_str = "" + for include in includes: + includes_str += f"include_directories(\"{include!s}\")\n" + includes_str = includes_str.replace("\\", "/") + + link_dirs_str = "" + for link in link_dirs: + link_dirs_str += f"link_directories(\"{link!s}\")\n" + link_dirs_str = link_dirs_str.replace("\\", "/") + + macros_str = "" + for macro in macros: + macros_str += f"add_definitions({macro!s})\n" + + if len(links) == 0: + links = "" + + else: + inline_links = " ".join(links) + links = f"target_link_libraries(${{PROJECT_NAME}} {inline_links})" + + with Path.open(Path(__file__).parent / "templates" / "CMakeLists.txt") as f: + content = f.read() + + includes = includes_str + link_dirs = link_dirs_str + macros = macros_str + source_file = str(Path(source_file).resolve()).replace("\\", "/") + + if "pyd" in suffix: + suffix = sysconfig.get_config_var('EXT_SUFFIX') + + fields = ["source_file", "project_name", "includes", "link_dirs", "links", "macros", "suffix"] + + for field in fields: + content = content.replace( + f"***{field}***", + locals()[field] + ) + + + import os + import subprocess + + cwd = Path.cwd() + + # Go to tmp dir + + tmp_build = tmp_dir / str(uuid.uuid4()) + + tmp_build.mkdir() + if (tmp_build / "build").is_dir(): + shutil.rmtree(tmp_build / "build") + + tmp_build.mkdir(exist_ok=True) + + with Path.open(Path(tmp_build) / "CMakeLists.txt", "w") as f: + f.write(content) + + if print_cmakelists: + print(content) + + os.chdir(tmp_build) + Path("build").mkdir() + os.chdir("build") + + if not show_cmake_commands_output: + + with Path.open(tmp_build / "log", "w") as log_file: + + subprocess.check_call(["cmake", ".."], stdout=log_file) + subprocess.check_call(["cmake", "--build", ".", "--config", "Release"], stdout=log_file) + + else: + subprocess.check_call(["cmake", ".."]) + subprocess.check_call(["cmake", "--build", ".", "--config", "Release"]) + + # Back to previous working directory 
+ os.chdir(cwd) + + # Define the source and destination directories + Path(output_dir).mkdir(exist_ok=True) + + source_dir = tmp_build / "build" / "Release" + + # Copy the contents of the source directory to the destination directory + for item in os.listdir(source_dir): + s = source_dir / item + d = output_dir / item + + # Copy files or directories + if str(s).endswith(suffix): + shutil.copy2(s, d) + + if clean_tmp_build_dir: + shutil.rmtree(tmp_build) diff --git a/keopscore/keopscore/windows_compilations/compile_nvrtc_jit.py b/keopscore/keopscore/windows_compilations/compile_nvrtc_jit.py new file mode 100644 index 000000000..5317cca21 --- /dev/null +++ b/keopscore/keopscore/windows_compilations/compile_nvrtc_jit.py @@ -0,0 +1,44 @@ +from pathlib import Path + +from .compile import compile +from .detection import ( + include_dirs, + lib_dirs, + lib_names, +) +from .utils import find_package_location + + +def compile_nvrtc_jit(build_folder): + + keops_dir = Path(find_package_location("keopscore")).parent + source_file = keops_dir / "binders" / "nvrtc" / "nvrtc_jit.cpp" + + + macros = [ + "-DMAXIDGPU=0", + "-DMAXTHREADSPERBLOCK0=1024", + "-DSHAREDMEMPERBLOCK0=49152", + "-DnvrtcGetTARGET=nvrtcGetCUBIN", + "-DnvrtcGetTARGETSize=nvrtcGetCUBINSize", + "-DARCHTAG=\"sm\"" + ] + + + compile( + source_file=source_file, + macros=macros, + includes= [ + include_dirs[key] for key in ["keops", "cuda"] + ], + link_dirs=[ + lib_dirs[key] for key in ["cuda"] + ], + links=[ + lib_names[key] for key in ["cuda", "nvrtc", "cudart"] + ], + suffix='.dll', + output_dir=build_folder, + print_cmakelists=False, + show_cmake_commands_output=False + ) diff --git a/keopscore/keopscore/windows_compilations/compile_pykeops_cpp_module.py b/keopscore/keopscore/windows_compilations/compile_pykeops_cpp_module.py new file mode 100644 index 000000000..cb40f10b6 --- /dev/null +++ b/keopscore/keopscore/windows_compilations/compile_pykeops_cpp_module.py @@ -0,0 +1,34 @@ +from .compile import compile 
+from .detection import include_dirs, lib_dirs, lib_names + + +def compile_pykeops_cpp_module(source_file, build_folder): + """This built-in compilation config serves to compile pykeops cpp modules + + Parameters + ---------- + source_file + Location of the source cpp file + build_folder + The KeOps build folder + + """ + + compile( + source_file=source_file, + includes= [ + include_dirs["python"], + include_dirs["pybind11"], + include_dirs["keops"] + ], + link_dirs=[ + lib_dirs["python"] + ], + links=[ + lib_names["python"] + ], + suffix='.pyd', + output_dir=build_folder, + print_cmakelists=False, + show_cmake_commands_output=False + ) diff --git a/keopscore/keopscore/windows_compilations/compile_pykeops_nvrtc.py b/keopscore/keopscore/windows_compilations/compile_pykeops_nvrtc.py new file mode 100644 index 000000000..fdae10309 --- /dev/null +++ b/keopscore/keopscore/windows_compilations/compile_pykeops_nvrtc.py @@ -0,0 +1,44 @@ +from pathlib import Path + +from .compile import compile +from .detection import ( + include_dirs, + lib_dirs, + lib_names, +) +from .utils import find_package_location + + +def compile_pykeops_nvrtc(build_folder): + + pykeops_dir = Path(find_package_location("pykeops")).parent + source_file = pykeops_dir / "common" / "keops_io" / "pykeops_nvrtc.cpp" + + + macros = [ + "-DMAXIDGPU=0", + "-DMAXTHREADSPERBLOCK0=1024", + "-DSHAREDMEMPERBLOCK0=49152", + "-DnvrtcGetTARGET=nvrtcGetCUBIN", + "-DnvrtcGetTARGETSize=nvrtcGetCUBINSize", + "-DARCHTAG=\"sm\"" + ] + + + compile( + source_file=source_file, + macros=macros, + includes= [ + include_dirs[key] for key in ["python", "pybind11", "keops", "cuda"] + ], + link_dirs=[ + lib_dirs[key] for key in ["python", "cuda"] + ], + links=[ + lib_names[key] for key in ["cuda", "nvrtc", "cudart", "python"] + ], + suffix='.pyd', + output_dir=build_folder, + print_cmakelists=False, + show_cmake_commands_output=False, + ) diff --git a/keopscore/keopscore/windows_compilations/cuda_detection.py 
b/keopscore/keopscore/windows_compilations/cuda_detection.py new file mode 100644 index 000000000..18d11c8bc --- /dev/null +++ b/keopscore/keopscore/windows_compilations/cuda_detection.py @@ -0,0 +1,42 @@ +import os +from ctypes.util import find_library +from pathlib import Path + + +def detect_cuda_toolkit(): + + if os.environ["CUDA_PATH"]: + + output = {} + + cuda_path = Path(os.environ["CUDA_PATH"]) # base path for cuda installation (including bin, lib, include, etc.) + cuda_bin = Path(cuda_path, "bin") # where the dlls are located + + if find_library("nvcuda") is not None: + output["dll_cuda"] = find_library("nvcuda") + + for file in cuda_bin.iterdir(): + + if file.name.startswith("cudart") and file.name.endswith(".dll"): + output["dll_cudart"] = str(file) + + if file.name.startswith("nvrtc-builtins") and file.name.endswith(".dll"): + output["dll_nvrtc"] = str(file) + + # See the files in Path(cuda_path, "cmake") for something more automatic here + cuda_include = Path(cuda_path, "include") + if cuda_include.is_dir(): + output["include_dir"] = str(cuda_include) + + + cuda_libs = Path(cuda_path, "lib", "x64") + if cuda_libs.is_dir(): + output["lib_dirs"] = str(cuda_libs) + + output["lib_names"] = {} + for key in ["cuda", "nvrtc", "cudart"]: + + if (cuda_libs / (key + ".lib")).is_file(): + output["lib_names"][key] = key + + return output diff --git a/keopscore/keopscore/windows_compilations/detection.py b/keopscore/keopscore/windows_compilations/detection.py new file mode 100644 index 000000000..f745f6dda --- /dev/null +++ b/keopscore/keopscore/windows_compilations/detection.py @@ -0,0 +1,64 @@ +import sys +import sysconfig +from pathlib import Path + +import pybind11 + +from .cuda_detection import detect_cuda_toolkit +from .utils import find_package_location + +include_dirs = {} +lib_dirs = {} +lib_names = {} +dlls = {} + + +try: + location_keops_init = find_package_location("keopscore") + include_dirs["keops"] = str(Path(location_keops_init).parent) + 
keops_available = True +except ImportError: + keops_available = False + + + +include_dirs["pybind11"] = pybind11.get_include() + + +# Python +include_dirs["python"] = sysconfig.get_path('include') + +if sys.platform == "win32": + # On Windows, get the path to the Python DLL + python_libs = Path(sysconfig.get_path('include')).parent / "libs" + + # Get the path to the standard library (Lib) + if python_libs.is_dir(): + lib_dirs["python"] = str(python_libs) + + version = str(sys.version_info.major) + str(sys.version_info.minor) + if (Path(lib_dirs["python"]) / ("python" + version + ".lib")).is_file(): + lib_names["python"] = "python" + version +else: + # On Unix-like systems, use sysconfig to get the library directory and name + lib_dirs["python"] = Path(sysconfig.get_config_var('LIBDIR')) + lib_names["python"] = sysconfig.get_config_var('LDLIBRARY') + + + +# Cuda +cuda_config = detect_cuda_toolkit() + +for key in ["cuda", "nvrtc", "cudart"]: + + if f"dll_{key}" in cuda_config: + dlls[key] = cuda_config[f"dll_{key}"] + + if key in cuda_config["lib_names"]: + lib_names[key] = cuda_config["lib_names"][key] + +if "include_dir" in cuda_config: + include_dirs["cuda"] = cuda_config["include_dir"] + +if "lib_dirs" in cuda_config: + lib_dirs["cuda"] = cuda_config["lib_dirs"] diff --git a/keopscore/keopscore/windows_compilations/globals.py b/keopscore/keopscore/windows_compilations/globals.py new file mode 100644 index 000000000..9eb9504c8 --- /dev/null +++ b/keopscore/keopscore/windows_compilations/globals.py @@ -0,0 +1,5 @@ +from pathlib import Path + +tmp_dir = Path.home() / "tmp" / "cppcompile_windows" +(Path.home() / "tmp").mkdir(exist_ok=True) +tmp_dir.mkdir(exist_ok=True) diff --git a/keopscore/keopscore/windows_compilations/templates/CMakeLists.txt b/keopscore/keopscore/windows_compilations/templates/CMakeLists.txt new file mode 100644 index 000000000..458a6b5af --- /dev/null +++ b/keopscore/keopscore/windows_compilations/templates/CMakeLists.txt @@ -0,0 +1,39 @@ 
+cmake_minimum_required(VERSION 3.10) +project(***project_name***) + +# Set C++ standard +set(CMAKE_CXX_STANDARD 11) + +# Set compiler options +if(MSVC) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /LD /Ox /GL /openmp") +else() + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -O3 -fopenmp") +endif() + +# Preprocessor macros +***macros*** + +# Include directories +***includes*** + +# Link directories +***link_dirs*** + +# Source file +set(SOURCE_FILE "***source_file***") + +# Add shared library +add_library(${PROJECT_NAME} SHARED ${SOURCE_FILE}) + +# Link +***links*** + +# Set the output path for the shared library and ensure the .pyd extension +set_target_properties(${PROJECT_NAME} PROPERTIES + LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}" + OUTPUT_NAME "***project_name***" + SUFFIX "***suffix***" # Ensure the correct file extension for Python modules + MSVC_WARNING_LEVEL 0 + MSVC_WARNING_DISABLE 8029 # Warning because we are building in a tmp dir +) diff --git a/keopscore/keopscore/windows_compilations/utils.py b/keopscore/keopscore/windows_compilations/utils.py new file mode 100644 index 000000000..320a06009 --- /dev/null +++ b/keopscore/keopscore/windows_compilations/utils.py @@ -0,0 +1,31 @@ +import importlib.util + + +def find_package_location(package_name: str) -> str: + """Find the __init__ file of a given package + + This function does not import the package, it was written to avoid circular + imports with KeOps + + Parameters + ---------- + package_name + The name of the package + + Returns + ------- + str + The path to the package + + Raises + ------ + ImportError + If the package cannot be loaded + + """ + spec = importlib.util.find_spec(package_name) + if spec.origin: + return spec.origin + else: + message = f"Package '{package_name}' not found." 
+ raise ImportError(message) From c17d6e39f2a71332dd4f1a21ef392aa3f89cd911 Mon Sep 17 00:00:00 2001 From: Louis Pujol <52413616+Louis-Pujol@users.noreply.github.com> Date: Mon, 19 May 2025 16:07:55 +0200 Subject: [PATCH 03/25] Use vectors in utils_pe.h --- keopscore/keopscore/include/utils_pe.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/keopscore/keopscore/include/utils_pe.h b/keopscore/keopscore/include/utils_pe.h index 55c91babc..56ef5e99b 100644 --- a/keopscore/keopscore/include/utils_pe.h +++ b/keopscore/keopscore/include/utils_pe.h @@ -1,5 +1,6 @@ #include #include +#include #define NVRTC_SAFE_CALL(x) \ do { \ @@ -68,7 +69,8 @@ void load_args_FromHost(CUdeviceptr &p_data, TYPE *out, TYPE *&out_d, int nargs, TYPE **arg, TYPE **&arg_d, const std::vector> &argshape, signed long int sizeout) { - signed long int sizes[nargs]; + + std::vector sizes(nargs); signed long int totsize = sizeout; for (int k = 0; k < nargs; k++) { sizes[k] = std::accumulate(argshape[k].begin(), argshape[k].end(), 1, @@ -83,7 +85,7 @@ void load_args_FromHost(CUdeviceptr &p_data, TYPE *out, TYPE *&out_d, int nargs, TYPE *dataloc = (TYPE *)(arg_d + nargs); // host array of pointers to device data - TYPE *ph[nargs]; + std::vector ph(nargs); out_d = dataloc; dataloc += sizeout; @@ -95,5 +97,5 @@ void load_args_FromHost(CUdeviceptr &p_data, TYPE *out, TYPE *&out_d, int nargs, } // copy array of pointers - CUDA_SAFE_CALL(cuMemcpyHtoD((CUdeviceptr)arg_d, ph, nargs * sizeof(TYPE *))); + CUDA_SAFE_CALL(cuMemcpyHtoD((CUdeviceptr)arg_d, ph.data(), nargs * sizeof(TYPE *))); } From 4abebfd0e88814f68d68b2cfb0a5a0d799280c80 Mon Sep 17 00:00:00 2001 From: Louis Pujol <52413616+Louis-Pujol@users.noreply.github.com> Date: Mon, 19 May 2025 17:20:08 +0200 Subject: [PATCH 04/25] lint --- .../windows_compilations/__init__.py | 2 +- .../keopscore/windows_compilations/compile.py | 53 +++++++++++-------- .../windows_compilations/compile_nvrtc_jit.py | 20 +++---- 
.../compile_pykeops_cpp_module.py | 16 +++--- .../compile_pykeops_nvrtc.py | 18 ++----- .../windows_compilations/cuda_detection.py | 7 +-- .../windows_compilations/detection.py | 10 ++-- .../keopscore/windows_compilations/utils.py | 2 +- 8 files changed, 57 insertions(+), 71 deletions(-) diff --git a/keopscore/keopscore/windows_compilations/__init__.py b/keopscore/keopscore/windows_compilations/__init__.py index 985c7f110..c8ba3a787 100644 --- a/keopscore/keopscore/windows_compilations/__init__.py +++ b/keopscore/keopscore/windows_compilations/__init__.py @@ -24,4 +24,4 @@ "compile_nvrtc_jit", "compile_pykeops_nvrtc", "keops_available", - ] +] diff --git a/keopscore/keopscore/windows_compilations/compile.py b/keopscore/keopscore/windows_compilations/compile.py index d07058500..f0c556ea1 100644 --- a/keopscore/keopscore/windows_compilations/compile.py +++ b/keopscore/keopscore/windows_compilations/compile.py @@ -9,19 +9,20 @@ path_type = str | os.PathLike | bytes _empty_list = [] + def compile( - source_file: path_type, - project_name: str | None = None, - includes: list[path_type] | None = _empty_list, - link_dirs: list[path_type] | None = _empty_list, - links: list[str] | None = _empty_list, - macros: list[str] | None = _empty_list, - suffix: str = ".dll", - output_dir=".", - print_cmakelists=False, - show_cmake_commands_output=False, - clean_tmp_build_dir=True, - ): + source_file: path_type, + project_name: str | None = None, + includes: list[path_type] | None = _empty_list, + link_dirs: list[path_type] | None = _empty_list, + links: list[str] | None = _empty_list, + macros: list[str] | None = _empty_list, + suffix: str = ".dll", + output_dir=".", + print_cmakelists=False, + show_cmake_commands_output=False, + clean_tmp_build_dir=True, +): output_dir = Path(output_dir) @@ -30,12 +31,12 @@ def compile( includes_str = "" for include in includes: - includes_str += f"include_directories(\"{include!s}\")\n" + includes_str += f'include_directories("{include!s}")\n' 
includes_str = includes_str.replace("\\", "/") link_dirs_str = "" for link in link_dirs: - link_dirs_str += f"link_directories(\"{link!s}\")\n" + link_dirs_str += f'link_directories("{link!s}")\n' link_dirs_str = link_dirs_str.replace("\\", "/") macros_str = "" @@ -58,16 +59,20 @@ def compile( source_file = str(Path(source_file).resolve()).replace("\\", "/") if "pyd" in suffix: - suffix = sysconfig.get_config_var('EXT_SUFFIX') - - fields = ["source_file", "project_name", "includes", "link_dirs", "links", "macros", "suffix"] + suffix = sysconfig.get_config_var("EXT_SUFFIX") + + fields = [ + "source_file", + "project_name", + "includes", + "link_dirs", + "links", + "macros", + "suffix", + ] for field in fields: - content = content.replace( - f"***{field}***", - locals()[field] - ) - + content = content.replace(f"***{field}***", locals()[field]) import os import subprocess @@ -99,7 +104,9 @@ def compile( with Path.open(tmp_build / "log", "w") as log_file: subprocess.check_call(["cmake", ".."], stdout=log_file) - subprocess.check_call(["cmake", "--build", ".", "--config", "Release"], stdout=log_file) + subprocess.check_call( + ["cmake", "--build", ".", "--config", "Release"], stdout=log_file + ) else: subprocess.check_call(["cmake", ".."]) diff --git a/keopscore/keopscore/windows_compilations/compile_nvrtc_jit.py b/keopscore/keopscore/windows_compilations/compile_nvrtc_jit.py index 5317cca21..23bd71679 100644 --- a/keopscore/keopscore/windows_compilations/compile_nvrtc_jit.py +++ b/keopscore/keopscore/windows_compilations/compile_nvrtc_jit.py @@ -14,31 +14,23 @@ def compile_nvrtc_jit(build_folder): keops_dir = Path(find_package_location("keopscore")).parent source_file = keops_dir / "binders" / "nvrtc" / "nvrtc_jit.cpp" - macros = [ "-DMAXIDGPU=0", "-DMAXTHREADSPERBLOCK0=1024", "-DSHAREDMEMPERBLOCK0=49152", "-DnvrtcGetTARGET=nvrtcGetCUBIN", "-DnvrtcGetTARGETSize=nvrtcGetCUBINSize", - "-DARCHTAG=\"sm\"" + '-DARCHTAG="sm"', ] - compile( source_file=source_file, 
macros=macros, - includes= [ - include_dirs[key] for key in ["keops", "cuda"] - ], - link_dirs=[ - lib_dirs[key] for key in ["cuda"] - ], - links=[ - lib_names[key] for key in ["cuda", "nvrtc", "cudart"] - ], - suffix='.dll', + includes=[include_dirs[key] for key in ["keops", "cuda"]], + link_dirs=[lib_dirs[key] for key in ["cuda"]], + links=[lib_names[key] for key in ["cuda", "nvrtc", "cudart"]], + suffix=".dll", output_dir=build_folder, print_cmakelists=False, - show_cmake_commands_output=False + show_cmake_commands_output=False, ) diff --git a/keopscore/keopscore/windows_compilations/compile_pykeops_cpp_module.py b/keopscore/keopscore/windows_compilations/compile_pykeops_cpp_module.py index cb40f10b6..66e84b94c 100644 --- a/keopscore/keopscore/windows_compilations/compile_pykeops_cpp_module.py +++ b/keopscore/keopscore/windows_compilations/compile_pykeops_cpp_module.py @@ -16,19 +16,15 @@ def compile_pykeops_cpp_module(source_file, build_folder): compile( source_file=source_file, - includes= [ + includes=[ include_dirs["python"], include_dirs["pybind11"], - include_dirs["keops"] + include_dirs["keops"], ], - link_dirs=[ - lib_dirs["python"] - ], - links=[ - lib_names["python"] - ], - suffix='.pyd', + link_dirs=[lib_dirs["python"]], + links=[lib_names["python"]], + suffix=".pyd", output_dir=build_folder, print_cmakelists=False, - show_cmake_commands_output=False + show_cmake_commands_output=False, ) diff --git a/keopscore/keopscore/windows_compilations/compile_pykeops_nvrtc.py b/keopscore/keopscore/windows_compilations/compile_pykeops_nvrtc.py index fdae10309..7fac03c00 100644 --- a/keopscore/keopscore/windows_compilations/compile_pykeops_nvrtc.py +++ b/keopscore/keopscore/windows_compilations/compile_pykeops_nvrtc.py @@ -14,30 +14,22 @@ def compile_pykeops_nvrtc(build_folder): pykeops_dir = Path(find_package_location("pykeops")).parent source_file = pykeops_dir / "common" / "keops_io" / "pykeops_nvrtc.cpp" - macros = [ "-DMAXIDGPU=0", 
"-DMAXTHREADSPERBLOCK0=1024", "-DSHAREDMEMPERBLOCK0=49152", "-DnvrtcGetTARGET=nvrtcGetCUBIN", "-DnvrtcGetTARGETSize=nvrtcGetCUBINSize", - "-DARCHTAG=\"sm\"" + '-DARCHTAG="sm"', ] - compile( source_file=source_file, macros=macros, - includes= [ - include_dirs[key] for key in ["python", "pybind11", "keops", "cuda"] - ], - link_dirs=[ - lib_dirs[key] for key in ["python", "cuda"] - ], - links=[ - lib_names[key] for key in ["cuda", "nvrtc", "cudart", "python"] - ], - suffix='.pyd', + includes=[include_dirs[key] for key in ["python", "pybind11", "keops", "cuda"]], + link_dirs=[lib_dirs[key] for key in ["python", "cuda"]], + links=[lib_names[key] for key in ["cuda", "nvrtc", "cudart", "python"]], + suffix=".pyd", output_dir=build_folder, print_cmakelists=False, show_cmake_commands_output=False, diff --git a/keopscore/keopscore/windows_compilations/cuda_detection.py b/keopscore/keopscore/windows_compilations/cuda_detection.py index 18d11c8bc..5c5d1dbab 100644 --- a/keopscore/keopscore/windows_compilations/cuda_detection.py +++ b/keopscore/keopscore/windows_compilations/cuda_detection.py @@ -9,8 +9,10 @@ def detect_cuda_toolkit(): output = {} - cuda_path = Path(os.environ["CUDA_PATH"]) # base path for cuda installation (including bin, lib, include, etc.) - cuda_bin = Path(cuda_path, "bin") # where the dlls are located + cuda_path = Path( + os.environ["CUDA_PATH"] + ) # base path for cuda installation (including bin, lib, include, etc.) 
+ cuda_bin = Path(cuda_path, "bin") # where the dlls are located if find_library("nvcuda") is not None: output["dll_cuda"] = find_library("nvcuda") @@ -28,7 +30,6 @@ def detect_cuda_toolkit(): if cuda_include.is_dir(): output["include_dir"] = str(cuda_include) - cuda_libs = Path(cuda_path, "lib", "x64") if cuda_libs.is_dir(): output["lib_dirs"] = str(cuda_libs) diff --git a/keopscore/keopscore/windows_compilations/detection.py b/keopscore/keopscore/windows_compilations/detection.py index f745f6dda..57548129b 100644 --- a/keopscore/keopscore/windows_compilations/detection.py +++ b/keopscore/keopscore/windows_compilations/detection.py @@ -21,16 +21,15 @@ keops_available = False - include_dirs["pybind11"] = pybind11.get_include() # Python -include_dirs["python"] = sysconfig.get_path('include') +include_dirs["python"] = sysconfig.get_path("include") if sys.platform == "win32": # On Windows, get the path to the Python DLL - python_libs = Path(sysconfig.get_path('include')).parent / "libs" + python_libs = Path(sysconfig.get_path("include")).parent / "libs" # Get the path to the standard library (Lib) if python_libs.is_dir(): @@ -41,9 +40,8 @@ lib_names["python"] = "python" + version else: # On Unix-like systems, use sysconfig to get the library directory and name - lib_dirs["python"] = Path(sysconfig.get_config_var('LIBDIR')) - lib_names["python"] = sysconfig.get_config_var('LDLIBRARY') - + lib_dirs["python"] = Path(sysconfig.get_config_var("LIBDIR")) + lib_names["python"] = sysconfig.get_config_var("LDLIBRARY") # Cuda diff --git a/keopscore/keopscore/windows_compilations/utils.py b/keopscore/keopscore/windows_compilations/utils.py index 320a06009..6a005d08e 100644 --- a/keopscore/keopscore/windows_compilations/utils.py +++ b/keopscore/keopscore/windows_compilations/utils.py @@ -27,5 +27,5 @@ def find_package_location(package_name: str) -> str: if spec.origin: return spec.origin else: - message = f"Package '{package_name}' not found." 
+ message = f"Package '{package_name}' not found." raise ImportError(message) From 5bd1c0e445324f007096242ced8a1942bc7e6b54 Mon Sep 17 00:00:00 2001 From: Louis Pujol <52413616+Louis-Pujol@users.noreply.github.com> Date: Sat, 28 Jun 2025 09:45:34 +0200 Subject: [PATCH 05/25] move windows compilmations to an independant package --- keopswindows/pyproject.toml | 80 +++++++++++++++++++ .../src/keopswindows}/__init__.py | 1 + .../src/keopswindows}/compile.py | 0 .../src/keopswindows}/compile_nvrtc_jit.py | 1 + .../compile_pykeops_cpp_module.py | 1 + .../keopswindows}/compile_pykeops_nvrtc.py | 1 + .../src/keopswindows}/cuda_detection.py | 2 + .../src/keopswindows}/detection.py | 0 .../src/keopswindows}/globals.py | 0 .../keopswindows}/templates/CMakeLists.txt | 0 .../src/keopswindows}/utils.py | 0 11 files changed, 86 insertions(+) create mode 100644 keopswindows/pyproject.toml rename {keopscore/keopscore/windows_compilations => keopswindows/src/keopswindows}/__init__.py (90%) rename {keopscore/keopscore/windows_compilations => keopswindows/src/keopswindows}/compile.py (100%) rename {keopscore/keopscore/windows_compilations => keopswindows/src/keopswindows}/compile_nvrtc_jit.py (93%) rename {keopscore/keopscore/windows_compilations => keopswindows/src/keopswindows}/compile_pykeops_cpp_module.py (92%) rename {keopscore/keopscore/windows_compilations => keopswindows/src/keopswindows}/compile_pykeops_nvrtc.py (93%) rename {keopscore/keopscore/windows_compilations => keopswindows/src/keopswindows}/cuda_detection.py (93%) rename {keopscore/keopscore/windows_compilations => keopswindows/src/keopswindows}/detection.py (100%) rename {keopscore/keopscore/windows_compilations => keopswindows/src/keopswindows}/globals.py (100%) rename {keopscore/keopscore/windows_compilations => keopswindows/src/keopswindows}/templates/CMakeLists.txt (100%) rename {keopscore/keopscore/windows_compilations => keopswindows/src/keopswindows}/utils.py (100%) diff --git a/keopswindows/pyproject.toml 
b/keopswindows/pyproject.toml new file mode 100644 index 000000000..0eeb255e1 --- /dev/null +++ b/keopswindows/pyproject.toml @@ -0,0 +1,80 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "keopswindows" +version = "0.1.0" +authors = [ + {name = "Louis Pujol"} +] +description = "KeOps on windows: adapters for KeOps to work on Windows" +requires-python = ">=3.7" +dependencies = [ + "cmake", +] + +[project.optional-dependencies] +dev = [ + "pytest", + "pytest-cov", + "pre-commit", +] + + +[tool.pytest.ini_options] +minversion = "6.0" +addopts = [ + "-ra", + "--durations=10", + "--showlocals", + "--strict-markers", + "--strict-config", + "--doctest-modules", + "--cov=keopswindows", + "--cov-report=term-missing", + "--cov-report=html", + "--cov-report=xml" + ] + +testpaths = ["src/keopswindows", "tests"] + + +# See: https://learn.scientific-python.org/development/guides/style/#PC190 +[tool.ruff] +line-length = 79 + +[tool.ruff.lint] +extend-select = [ + "B", # flake8-bugbear + "I", # isort + "ARG", # flake8-unused-arguments + "C4", # flake8-comprehensions + "EM", # flake8-errmsg + "ICN", # flake8-import-conventions + "G", # flake8-logging-format + "PGH", # pygrep-hooks + "PIE", # flake8-pie + "PL", # pylint + "PT", # flake8-pytest-style + "PTH", # flake8-use-pathlib + "RET", # flake8-return + "RUF", # Ruff-specific + "SIM", # flake8-simplify + "T20", # flake8-print + "UP", # pyupgrade + "YTT", # flake8-2020 + "EXE", # flake8-executable + "NPY", # NumPy specific rules + "FURB", # refurb + "PYI", # flake8-pyi +] + +[tool.ruff.lint.per-file-ignores] +"tests/**" = ["T201", "PLR2004"] # it is ok to print in tests + magic values in comparison +"examples/**" = ["T201"] # also ok to print in examples + + +[tool.ruff.format] +indent-style = "space" +docstring-code-format = true diff --git a/keopscore/keopscore/windows_compilations/__init__.py b/keopswindows/src/keopswindows/__init__.py similarity index 90% rename from 
keopscore/keopscore/windows_compilations/__init__.py rename to keopswindows/src/keopswindows/__init__.py index c8ba3a787..cc7a91321 100644 --- a/keopscore/keopscore/windows_compilations/__init__.py +++ b/keopswindows/src/keopswindows/__init__.py @@ -12,6 +12,7 @@ lib_names, ) from .globals import tmp_dir +from .cuda_detection import cuda_available __all__ = [ "compile", diff --git a/keopscore/keopscore/windows_compilations/compile.py b/keopswindows/src/keopswindows/compile.py similarity index 100% rename from keopscore/keopscore/windows_compilations/compile.py rename to keopswindows/src/keopswindows/compile.py diff --git a/keopscore/keopscore/windows_compilations/compile_nvrtc_jit.py b/keopswindows/src/keopswindows/compile_nvrtc_jit.py similarity index 93% rename from keopscore/keopscore/windows_compilations/compile_nvrtc_jit.py rename to keopswindows/src/keopswindows/compile_nvrtc_jit.py index 23bd71679..ea20ced89 100644 --- a/keopscore/keopscore/windows_compilations/compile_nvrtc_jit.py +++ b/keopswindows/src/keopswindows/compile_nvrtc_jit.py @@ -33,4 +33,5 @@ def compile_nvrtc_jit(build_folder): output_dir=build_folder, print_cmakelists=False, show_cmake_commands_output=False, + clean_tmp_build_dir=False, ) diff --git a/keopscore/keopscore/windows_compilations/compile_pykeops_cpp_module.py b/keopswindows/src/keopswindows/compile_pykeops_cpp_module.py similarity index 92% rename from keopscore/keopscore/windows_compilations/compile_pykeops_cpp_module.py rename to keopswindows/src/keopswindows/compile_pykeops_cpp_module.py index 66e84b94c..47c8f56ba 100644 --- a/keopscore/keopscore/windows_compilations/compile_pykeops_cpp_module.py +++ b/keopswindows/src/keopswindows/compile_pykeops_cpp_module.py @@ -27,4 +27,5 @@ def compile_pykeops_cpp_module(source_file, build_folder): output_dir=build_folder, print_cmakelists=False, show_cmake_commands_output=False, + clean_tmp_build_dir=False, ) diff --git a/keopscore/keopscore/windows_compilations/compile_pykeops_nvrtc.py 
b/keopswindows/src/keopswindows/compile_pykeops_nvrtc.py similarity index 93% rename from keopscore/keopscore/windows_compilations/compile_pykeops_nvrtc.py rename to keopswindows/src/keopswindows/compile_pykeops_nvrtc.py index 7fac03c00..77fe5fb48 100644 --- a/keopscore/keopscore/windows_compilations/compile_pykeops_nvrtc.py +++ b/keopswindows/src/keopswindows/compile_pykeops_nvrtc.py @@ -33,4 +33,5 @@ def compile_pykeops_nvrtc(build_folder): output_dir=build_folder, print_cmakelists=False, show_cmake_commands_output=False, + clean_tmp_build_dir=False, ) diff --git a/keopscore/keopscore/windows_compilations/cuda_detection.py b/keopswindows/src/keopswindows/cuda_detection.py similarity index 93% rename from keopscore/keopscore/windows_compilations/cuda_detection.py rename to keopswindows/src/keopswindows/cuda_detection.py index 5c5d1dbab..43513c95d 100644 --- a/keopscore/keopscore/windows_compilations/cuda_detection.py +++ b/keopswindows/src/keopswindows/cuda_detection.py @@ -2,6 +2,8 @@ from ctypes.util import find_library from pathlib import Path +cuda_available = find_library("nvcuda") is not None + def detect_cuda_toolkit(): diff --git a/keopscore/keopscore/windows_compilations/detection.py b/keopswindows/src/keopswindows/detection.py similarity index 100% rename from keopscore/keopscore/windows_compilations/detection.py rename to keopswindows/src/keopswindows/detection.py diff --git a/keopscore/keopscore/windows_compilations/globals.py b/keopswindows/src/keopswindows/globals.py similarity index 100% rename from keopscore/keopscore/windows_compilations/globals.py rename to keopswindows/src/keopswindows/globals.py diff --git a/keopscore/keopscore/windows_compilations/templates/CMakeLists.txt b/keopswindows/src/keopswindows/templates/CMakeLists.txt similarity index 100% rename from keopscore/keopscore/windows_compilations/templates/CMakeLists.txt rename to keopswindows/src/keopswindows/templates/CMakeLists.txt diff --git 
a/keopscore/keopscore/windows_compilations/utils.py b/keopswindows/src/keopswindows/utils.py similarity index 100% rename from keopscore/keopscore/windows_compilations/utils.py rename to keopswindows/src/keopswindows/utils.py From fdba11fd07bf7bbae3185ae2ae63fb2329422cdb Mon Sep 17 00:00:00 2001 From: Louis Pujol <52413616+Louis-Pujol@users.noreply.github.com> Date: Fri, 11 Jul 2025 09:53:19 +0200 Subject: [PATCH 06/25] move back windows compability to keopscore --- .../windows_compilations}/__init__.py | 0 .../windows_compilations}/compile.py | 0 .../compile_nvrtc_jit.py | 3 +- .../compile_pykeops_cpp_module.py | 0 .../compile_pykeops_nvrtc.py | 3 +- .../windows_compilations}/cuda_detection.py | 0 .../windows_compilations}/detection.py | 0 .../windows_compilations}/globals.py | 0 .../templates/CMakeLists.txt | 0 .../keopscore/windows_compilations}/utils.py | 0 keopswindows/pyproject.toml | 80 ---------- .../pykeops/common/keops_io/LoadKeOps_cpp.py | 130 ++++++++++++++- .../common/keops_io/pykeops_nvrtc_win.cpp | 148 ++++++++++++++++++ 13 files changed, 279 insertions(+), 85 deletions(-) rename {keopswindows/src/keopswindows => keopscore/keopscore/windows_compilations}/__init__.py (100%) rename {keopswindows/src/keopswindows => keopscore/keopscore/windows_compilations}/compile.py (100%) rename {keopswindows/src/keopswindows => keopscore/keopscore/windows_compilations}/compile_nvrtc_jit.py (86%) rename {keopswindows/src/keopswindows => keopscore/keopscore/windows_compilations}/compile_pykeops_cpp_module.py (100%) rename {keopswindows/src/keopswindows => keopscore/keopscore/windows_compilations}/compile_pykeops_nvrtc.py (92%) rename {keopswindows/src/keopswindows => keopscore/keopscore/windows_compilations}/cuda_detection.py (100%) rename {keopswindows/src/keopswindows => keopscore/keopscore/windows_compilations}/detection.py (100%) rename {keopswindows/src/keopswindows => keopscore/keopscore/windows_compilations}/globals.py (100%) rename 
{keopswindows/src/keopswindows => keopscore/keopscore/windows_compilations}/templates/CMakeLists.txt (100%) rename {keopswindows/src/keopswindows => keopscore/keopscore/windows_compilations}/utils.py (100%) delete mode 100644 keopswindows/pyproject.toml create mode 100644 pykeops/pykeops/common/keops_io/pykeops_nvrtc_win.cpp diff --git a/keopswindows/src/keopswindows/__init__.py b/keopscore/keopscore/windows_compilations/__init__.py similarity index 100% rename from keopswindows/src/keopswindows/__init__.py rename to keopscore/keopscore/windows_compilations/__init__.py diff --git a/keopswindows/src/keopswindows/compile.py b/keopscore/keopscore/windows_compilations/compile.py similarity index 100% rename from keopswindows/src/keopswindows/compile.py rename to keopscore/keopscore/windows_compilations/compile.py diff --git a/keopswindows/src/keopswindows/compile_nvrtc_jit.py b/keopscore/keopscore/windows_compilations/compile_nvrtc_jit.py similarity index 86% rename from keopswindows/src/keopswindows/compile_nvrtc_jit.py rename to keopscore/keopscore/windows_compilations/compile_nvrtc_jit.py index ea20ced89..7227bdd59 100644 --- a/keopswindows/src/keopswindows/compile_nvrtc_jit.py +++ b/keopscore/keopscore/windows_compilations/compile_nvrtc_jit.py @@ -12,7 +12,7 @@ def compile_nvrtc_jit(build_folder): keops_dir = Path(find_package_location("keopscore")).parent - source_file = keops_dir / "binders" / "nvrtc" / "nvrtc_jit.cpp" + source_file = keops_dir / "binders" / "nvrtc" / "nvrtc_jit_win.cpp" macros = [ "-DMAXIDGPU=0", @@ -25,6 +25,7 @@ def compile_nvrtc_jit(build_folder): compile( source_file=source_file, + project_name="nvrtc_jit", macros=macros, includes=[include_dirs[key] for key in ["keops", "cuda"]], link_dirs=[lib_dirs[key] for key in ["cuda"]], diff --git a/keopswindows/src/keopswindows/compile_pykeops_cpp_module.py b/keopscore/keopscore/windows_compilations/compile_pykeops_cpp_module.py similarity index 100% rename from 
keopswindows/src/keopswindows/compile_pykeops_cpp_module.py rename to keopscore/keopscore/windows_compilations/compile_pykeops_cpp_module.py diff --git a/keopswindows/src/keopswindows/compile_pykeops_nvrtc.py b/keopscore/keopscore/windows_compilations/compile_pykeops_nvrtc.py similarity index 92% rename from keopswindows/src/keopswindows/compile_pykeops_nvrtc.py rename to keopscore/keopscore/windows_compilations/compile_pykeops_nvrtc.py index 77fe5fb48..e27a09c3e 100644 --- a/keopswindows/src/keopswindows/compile_pykeops_nvrtc.py +++ b/keopscore/keopscore/windows_compilations/compile_pykeops_nvrtc.py @@ -12,7 +12,7 @@ def compile_pykeops_nvrtc(build_folder): pykeops_dir = Path(find_package_location("pykeops")).parent - source_file = pykeops_dir / "common" / "keops_io" / "pykeops_nvrtc.cpp" + source_file = pykeops_dir / "common" / "keops_io" / "pykeops_nvrtc_win.cpp" macros = [ "-DMAXIDGPU=0", @@ -25,6 +25,7 @@ def compile_pykeops_nvrtc(build_folder): compile( source_file=source_file, + project_name="pykeops_nvrtc", macros=macros, includes=[include_dirs[key] for key in ["python", "pybind11", "keops", "cuda"]], link_dirs=[lib_dirs[key] for key in ["python", "cuda"]], diff --git a/keopswindows/src/keopswindows/cuda_detection.py b/keopscore/keopscore/windows_compilations/cuda_detection.py similarity index 100% rename from keopswindows/src/keopswindows/cuda_detection.py rename to keopscore/keopscore/windows_compilations/cuda_detection.py diff --git a/keopswindows/src/keopswindows/detection.py b/keopscore/keopscore/windows_compilations/detection.py similarity index 100% rename from keopswindows/src/keopswindows/detection.py rename to keopscore/keopscore/windows_compilations/detection.py diff --git a/keopswindows/src/keopswindows/globals.py b/keopscore/keopscore/windows_compilations/globals.py similarity index 100% rename from keopswindows/src/keopswindows/globals.py rename to keopscore/keopscore/windows_compilations/globals.py diff --git 
a/keopswindows/src/keopswindows/templates/CMakeLists.txt b/keopscore/keopscore/windows_compilations/templates/CMakeLists.txt similarity index 100% rename from keopswindows/src/keopswindows/templates/CMakeLists.txt rename to keopscore/keopscore/windows_compilations/templates/CMakeLists.txt diff --git a/keopswindows/src/keopswindows/utils.py b/keopscore/keopscore/windows_compilations/utils.py similarity index 100% rename from keopswindows/src/keopswindows/utils.py rename to keopscore/keopscore/windows_compilations/utils.py diff --git a/keopswindows/pyproject.toml b/keopswindows/pyproject.toml deleted file mode 100644 index 0eeb255e1..000000000 --- a/keopswindows/pyproject.toml +++ /dev/null @@ -1,80 +0,0 @@ -[build-system] -requires = ["hatchling"] -build-backend = "hatchling.build" - -[project] -name = "keopswindows" -version = "0.1.0" -authors = [ - {name = "Louis Pujol"} -] -description = "KeOps on windows: adapters for KeOps to work on Windows" -requires-python = ">=3.7" -dependencies = [ - "cmake", -] - -[project.optional-dependencies] -dev = [ - "pytest", - "pytest-cov", - "pre-commit", -] - - -[tool.pytest.ini_options] -minversion = "6.0" -addopts = [ - "-ra", - "--durations=10", - "--showlocals", - "--strict-markers", - "--strict-config", - "--doctest-modules", - "--cov=keopswindows", - "--cov-report=term-missing", - "--cov-report=html", - "--cov-report=xml" - ] - -testpaths = ["src/keopswindows", "tests"] - - -# See: https://learn.scientific-python.org/development/guides/style/#PC190 -[tool.ruff] -line-length = 79 - -[tool.ruff.lint] -extend-select = [ - "B", # flake8-bugbear - "I", # isort - "ARG", # flake8-unused-arguments - "C4", # flake8-comprehensions - "EM", # flake8-errmsg - "ICN", # flake8-import-conventions - "G", # flake8-logging-format - "PGH", # pygrep-hooks - "PIE", # flake8-pie - "PL", # pylint - "PT", # flake8-pytest-style - "PTH", # flake8-use-pathlib - "RET", # flake8-return - "RUF", # Ruff-specific - "SIM", # flake8-simplify - "T20", # 
flake8-print - "UP", # pyupgrade - "YTT", # flake8-2020 - "EXE", # flake8-executable - "NPY", # NumPy specific rules - "FURB", # refurb - "PYI", # flake8-pyi -] - -[tool.ruff.lint.per-file-ignores] -"tests/**" = ["T201", "PLR2004"] # it is ok to print in tests + magic values in comparison -"examples/**" = ["T201"] # also ok to print in examples - - -[tool.ruff.format] -indent-style = "space" -docstring-code-format = true diff --git a/pykeops/pykeops/common/keops_io/LoadKeOps_cpp.py b/pykeops/pykeops/common/keops_io/LoadKeOps_cpp.py index 7b632f3c9..17e0b26fb 100644 --- a/pykeops/pykeops/common/keops_io/LoadKeOps_cpp.py +++ b/pykeops/pykeops/common/keops_io/LoadKeOps_cpp.py @@ -12,6 +12,7 @@ from pykeops.config import pykeops_cpp_name, python_includes + class LoadKeOps_cpp_class(LoadKeOps): def __init__(self, *args, fast_init=False): super().__init__(*args, fast_init=fast_init) @@ -33,7 +34,16 @@ def init_phase1(self): flush=True, end="", ) - KeOps_OS_Run(compile_command) + + if os.name == "nt": + from keopscore.windows_compilations import compile_pykeops_cpp_module + + compile_pykeops_cpp_module( + source_file=srcname, + build_folder=get_build_folder(), + ) + else: + KeOps_OS_Run(compile_command) pyKeOps_Message("OK", use_tag=False, flush=True) def init_phase2(self): @@ -70,7 +80,8 @@ def call_keops(self, nx, ny): ) def get_pybind11_code(self): - return f""" + if os.name != "nt": + return f""" #include "{self.params.source_name}" #include @@ -152,7 +163,7 @@ def get_pybind11_code(self): }} - return launch_keops_cpu_{self.params.tag}< TYPE >(dimY, + return launch_keops_{self.params.tag}_cpu< TYPE >(dimY, nx, ny, tagI, @@ -180,6 +191,119 @@ def get_pybind11_code(self): m.def("launch_pykeops_cpu", &launch_pykeops_{self.params.tag}_cpu < {cpp_dtype[self.params.dtype]} >, "Entry point to keops."); }} """ + + else: + return f""" +#include "{self.params.source_name}" + +#include +namespace py = pybind11; + +template < typename TYPE > +int 
launch_pykeops_{self.params.tag}_cpu(int dimY, int nx, int ny, + int tagI, int tagZero, int use_half, + int dimred, + int use_chunk_mode, + py::tuple py_indsi, py::tuple py_indsj, py::tuple py_indsp, + int dimout, + py::tuple py_dimsx, py::tuple py_dimsy, py::tuple py_dimsp, + py::tuple py_ranges, + py::tuple py_shapeout, + long out_void, + py::tuple py_arg, + py::tuple py_argshape){{ + + /*------------------------------------*/ + /* Cast input args */ + /*------------------------------------*/ + + std::vector< int > indsi_v(py_indsi.size()); + for (auto i = 0; i < py_indsi.size(); i++) + indsi_v[i] = py::cast< int >(py_indsi[i]); + + + std::vector< int > indsj_v(py_indsj.size()); + for (auto i = 0; i < py_indsj.size(); i++) + indsj_v[i] = py::cast< int >(py_indsj[i]); + + + std::vector< int > indsp_v(py_indsp.size()); + for (auto i = 0; i < py_indsp.size(); i++) + indsp_v[i] = py::cast< int >(py_indsp[i]); + + + std::vector< int > dimsx_v(py_dimsx.size()); + for (auto i = 0; i < py_dimsx.size(); i++) + dimsx_v[i] = py::cast< int >(py_dimsx[i]); + + + std::vector< int > dimsy_v(py_dimsy.size()); + for (auto i = 0; i < py_dimsy.size(); i++) + dimsy_v[i] = py::cast< int >(py_dimsy[i]); + + + std::vector< int > dimsp_v(py_dimsp.size()); + for (auto i = 0; i < py_dimsp.size(); i++) + dimsp_v[i] = py::cast< int >(py_dimsp[i]); + + + // Cast the ranges arrays + std::vector< int* > ranges_v(py_ranges.size()); + for (int i = 0; i < py_ranges.size(); i++) + ranges_v[i] = (int*) py::cast< long >(py_ranges[i]); + int **ranges = (int**) ranges_v.data(); + + std::vector< int > shapeout_v(py_shapeout.size()); + for (auto i = 0; i < py_shapeout.size(); i++) + shapeout_v[i] = py::cast< int >(py_shapeout[i]); + + TYPE *out = (TYPE*) out_void; + // std::cout << "out_ptr : " << (long) out << std::endl; + + std::vector< TYPE* > arg_v(py_arg.size()); + for (int i = 0; i < py_arg.size(); i++) + arg_v[i] = (TYPE*) py::cast< long >(py_arg[i]); + TYPE **arg = (TYPE**) arg_v.data(); + + 
std::vector< std::vector< int > > argshape_v(py_argshape.size()); + for (auto i = 0; i < py_argshape.size(); i++){{ + py::tuple tmp = py_argshape[i]; + std::vector< int > tmp_v(tmp.size()); + for (auto j =0; j < tmp.size(); j++) + tmp_v[j] = py::cast< int >(tmp[j]); + argshape_v[i] = tmp_v; + }} + + + return launch_keops_cpu_{self.params.tag}< TYPE >(dimY, + nx, + ny, + tagI, + tagZero, + use_half, + dimred, + use_chunk_mode, + indsi_v, + indsj_v, + indsp_v, + dimout, + dimsx_v, + dimsy_v, + dimsp_v, + ranges, + shapeout_v, + out, + arg, + argshape_v); + +}} + +PYBIND11_MODULE(pykeops_cpp_{self.params.tag}, m) {{ + m.doc() = "pyKeOps: KeOps for pytorch through pybind11 (pytorch flavour)."; + m.def("launch_pykeops_cpu", &launch_pykeops_{self.params.tag}_cpu < {cpp_dtype[self.params.dtype]} >, "Entry point to keops."); +}} + """.replace("long", "int64_t") + LoadKeOps_cpp = Cache_partial( diff --git a/pykeops/pykeops/common/keops_io/pykeops_nvrtc_win.cpp b/pykeops/pykeops/common/keops_io/pykeops_nvrtc_win.cpp new file mode 100644 index 000000000..20d8c3d3b --- /dev/null +++ b/pykeops/pykeops/common/keops_io/pykeops_nvrtc_win.cpp @@ -0,0 +1,148 @@ +// g++ -std=c++11 -shared -fPIC -O3 -fpermissive -lcuda -lnvrtc -L/usr/lib -L/opt/cuda/lib64 -L/opt/cuda/targets/x86_64-linux/lib/ -I/opt/cuda/targets/x86_64-linux/include/ -I /home/bcharlier/projets/keops/keops/keopscore/ -I/usr/include/python3.10/ -DnvrtcGetTARGET=nvrtcGetCUBIN -DnvrtcGetTARGETSize=nvrtcGetCUBINSize -DARCHTAG=\"sm\" keops_io.cpp -o keops_io.cpython-310-x86_64-linux-gnu.so + + +#include +#include + +namespace py = pybind11; + +template< typename TYPE > +class KeOps_module_python : public KeOps_module< TYPE > { +public: + + using KeOps_module< TYPE >::KeOps_module; + + + int operator()(int tagHostDevice, int dimY, int nx, int ny, + int tagI, int tagZero, int use_half, + int tag1D2D, int dimred, + int cuda_block_size, int use_chunk_mode, + py::tuple py_indsi, py::tuple py_indsj, py::tuple py_indsp, + int 
dimout, + py::tuple py_dimsx, py::tuple py_dimsy, py::tuple py_dimsp, + py::tuple py_ranges, + py::tuple py_shapeout, + py::tuple out_void, + py::tuple py_arg, + py::tuple py_argshape + ) { + + /*------------------------------------*/ + /* Cast input args */ + /*------------------------------------*/ + + std::vector< int > indsi_v(py_indsi.size()); + for (auto i = 0; i < py_indsi.size(); i++) + indsi_v[i] = py::cast< int >(py_indsi[i]); + + + std::vector< int > indsj_v(py_indsj.size()); + for (auto i = 0; i < py_indsj.size(); i++) + indsj_v[i] = py::cast< int >(py_indsj[i]); + + + std::vector< int > indsp_v(py_indsp.size()); + for (auto i = 0; i < py_indsp.size(); i++) + indsp_v[i] = py::cast< int >(py_indsp[i]); + + + std::vector< int > dimsx_v(py_dimsx.size()); + for (auto i = 0; i < py_dimsx.size(); i++) + dimsx_v[i] = py::cast< int >(py_dimsx[i]); + + + std::vector< int > dimsy_v(py_dimsy.size()); + for (auto i = 0; i < py_dimsy.size(); i++) + dimsy_v[i] = py::cast< int >(py_dimsy[i]); + + + std::vector< int > dimsp_v(py_dimsp.size()); + for (auto i = 0; i < py_dimsp.size(); i++) + dimsp_v[i] = py::cast< int >(py_dimsp[i]); + + + // Cast the ranges arrays + std::vector< int * > ranges_v(py_ranges.size()); + for (int i = 0; i < py_ranges.size(); i++) + ranges_v[i] = (int *) py::cast< int64_t >(py_ranges[i]); + int **ranges = (int **) ranges_v.data(); + + // for (auto i: ranges_v) + // std::cout << " " << (long) i << " "; + // std::cout << std::endl; + + //for (auto i=0; i<7; i++) + // std::cout << " " << (long) ranges[i] << " "; + //std::cout << std::endl; + + std::vector< int > shapeout_v(py_shapeout.size()); + for (auto i = 0; i < py_shapeout.size(); i++) + shapeout_v[i] = py::cast< int >(py_shapeout[i]); + + TYPE *out = (TYPE *)py::cast< int64_t >(out_void[0]); + // std::cout << "out_ptr : " << (long) out << std::endl; + + std::vector < TYPE * > arg_v(py_arg.size()); + for (int i = 0; i < py_arg.size(); i++) + arg_v[i] = (TYPE *) py::cast< int64_t 
>(py_arg[i]); + TYPE **arg = (TYPE **) arg_v.data(); + + std::vector > argshape_v(py_argshape.size()); + for (auto i = 0; i < py_argshape.size(); i++) { + py::tuple tmp = py_argshape[i]; + std::vector< int > tmp_v(tmp.size()); + for (auto j = 0; j < tmp.size(); j++) + tmp_v[j] = py::cast< int >(tmp[j]); + argshape_v[i] = tmp_v; + } + +// for (auto i : argshape_v) +// for (auto j : i) +// std::cout << j << " " ; + + return KeOps_module< TYPE >::launch_kernel(tagHostDevice, + dimY, + nx, + ny, + tagI, + tagZero, + use_half, + tag1D2D, + dimred, + cuda_block_size, + use_chunk_mode, + indsi_v, + indsj_v, + indsp_v, + dimout, + dimsx_v, + dimsy_v, + dimsp_v, + ranges, + shapeout_v, + out, + arg, + argshape_v); + } + +}; +///////////////////////////////////////////////////////////////////////////////// +// PyBind11 entry point // +///////////////////////////////////////////////////////////////////////////////// + + +PYBIND11_MODULE(pykeops_nvrtc, m) { +m.doc() = "pyKeOps: KeOps for pytorch through pybind11 (pytorch flavour)."; + +py::class_< KeOps_module_python< float > >(m, "KeOps_module_float") +.def(py::init()) +.def("__call__", &KeOps_module_python< float >::operator()); + +py::class_< KeOps_module_python< double > >(m, "KeOps_module_double") +.def(py::init()) +.def("__call__", &KeOps_module_python< double >::operator()); + +py::class_< KeOps_module_python< half2 > >(m, "KeOps_module_half2") +.def(py::init()) +.def("__call__", &KeOps_module_python< half2 >::operator()); +} From b3def29dbf0126e0d0775cbf767bc80323bf7789 Mon Sep 17 00:00:00 2001 From: Louis Pujol <52413616+Louis-Pujol@users.noreply.github.com> Date: Fri, 11 Jul 2025 09:54:22 +0200 Subject: [PATCH 07/25] add windows specific cpp files --- keopscore/keopscore/binders/LinkCompile.py | 6 +- .../binders/nvrtc/Gpu_link_compile.py | 66 +- .../binders/nvrtc/keops_nvrtc_win.cpp | 604 ++++++++++++++++++ .../keopscore/binders/nvrtc/nvrtc_jit_win.cpp | 114 ++++ keopscore/keopscore/config/__init__.py | 6 +- 
keopscore/keopscore/config/cuda_windows.py | 207 ++++++ keopscore/keopscore/include/CudaSizes_win.h | 96 +++ keopscore/keopscore/include/Ranges_win.h | 99 +++ keopscore/keopscore/include/Sizes_win.h | 402 ++++++++++++ .../keopscore/include/ranges_utils_win.h | 82 +++ keopscore/keopscore/include/utils_pe_win.h | 100 +++ .../common/keops_io/LoadKeOps_nvrtc.py | 8 +- 12 files changed, 1767 insertions(+), 23 deletions(-) create mode 100644 keopscore/keopscore/binders/nvrtc/keops_nvrtc_win.cpp create mode 100644 keopscore/keopscore/binders/nvrtc/nvrtc_jit_win.cpp create mode 100644 keopscore/keopscore/config/cuda_windows.py create mode 100644 keopscore/keopscore/include/CudaSizes_win.h create mode 100644 keopscore/keopscore/include/Ranges_win.h create mode 100644 keopscore/keopscore/include/Sizes_win.h create mode 100644 keopscore/keopscore/include/ranges_utils_win.h create mode 100644 keopscore/keopscore/include/utils_pe_win.h diff --git a/keopscore/keopscore/binders/LinkCompile.py b/keopscore/keopscore/binders/LinkCompile.py index aae5d70ea..e561745ce 100644 --- a/keopscore/keopscore/binders/LinkCompile.py +++ b/keopscore/keopscore/binders/LinkCompile.py @@ -84,7 +84,11 @@ def read_info(self): def write_code(self): # write the generated code in the source file ; this is used as a subfunction of compile_code f = open(self.gencode_file, "w") - f.write(self.code) + if os.name == "nt": + f.write(self.code.replace("signed long int", "int")) + else: + f.write(self.code) + f.close() def generate_code(self): diff --git a/keopscore/keopscore/binders/nvrtc/Gpu_link_compile.py b/keopscore/keopscore/binders/nvrtc/Gpu_link_compile.py index ccd6eb00d..e0b90774f 100644 --- a/keopscore/keopscore/binders/nvrtc/Gpu_link_compile.py +++ b/keopscore/keopscore/binders/nvrtc/Gpu_link_compile.py @@ -1,6 +1,5 @@ import os from ctypes import create_string_buffer, CDLL, c_int -from os import RTLD_LAZY import sysconfig from os.path import join @@ -34,10 +33,16 @@ def jit_compile_dll(): - return 
os.path.join( - build_folder, - "nvrtc_jit" + sysconfig.get_config_var("SHLIB_SUFFIX"), - ) + if os.name == "nt": + return os.path.join( + build_folder, + "nvrtc_jit.dll" + ) + else: + return os.path.join( + build_folder, + "nvrtc_jit" + sysconfig.get_config_var("SHLIB_SUFFIX"), + ) class Gpu_link_compile(LinkCompile): @@ -61,7 +66,10 @@ def __init__(self): self.low_level_code_prefix + self.gencode_filename, ).encode("utf-8") - self.my_c_dll = CDLL(jit_compile_dll(), mode=RTLD_LAZY) + if os.name != "nt": + self.my_c_dll = CDLL(jit_compile_dll(), mode=os.RTLD_LAZY) + else: + self.my_c_dll = CDLL(jit_compile_dll()) # actual dll to be called is the jit binary, TODO: check if this is relevent self.true_dllname = jit_binary # file to check for existence to detect compilation is needed @@ -75,16 +83,28 @@ def generate_code(self): self.write_code() # we execute the main dll, passing the code as argument, and the name of the low level code file to save the assembly instructions - res = self.my_c_dll.Compile( - create_string_buffer(self.low_level_code_file), - create_string_buffer(self.code.encode("utf-8")), - c_int(self.use_half), - c_int(self.use_fast_math), - c_int(self.device_id), - create_string_buffer( - (custom_cuda_include_fp16_path() + os.path.sep).encode("utf-8") - ), - ) + breakpoint() + if os.name != "nt": + res = self.my_c_dll.Compile( + create_string_buffer(self.low_level_code_file), + create_string_buffer(self.code.encode("utf-8")), + c_int(self.use_half), + c_int(self.use_fast_math), + c_int(self.device_id), + create_string_buffer( + (custom_cuda_include_fp16_path() + os.path.sep).encode("utf-8") + ), + ) + else: + res = self.my_c_dll.Compile( + create_string_buffer(self.low_level_code_file), + create_string_buffer(self.code.encode("utf-8")), + c_int(self.use_half), + c_int(self.device_id), + create_string_buffer( + (custom_cuda_include_fp16_path() + os.path.sep).encode("utf-8") + ), + ) if res != 0: KeOps_Error( f"Error when compiling formula (error in 
nvrtcCompileProgram, nvrtcResult={res})" @@ -116,8 +136,14 @@ def get_compile_command( @staticmethod def compile_jit_compile_dll(): KeOps_Message("Compiling cuda jit compiler engine ... ", flush=True, end="") - command = Gpu_link_compile.get_compile_command( - sourcename=jit_compile_src, dllname=jit_compile_dll() - ) - KeOps_OS_Run(command) + if os.name == "nt": + from ...windows_compilations import compile_nvrtc_jit + + compile_nvrtc_jit(build_folder=build_folder) + else: + command = Gpu_link_compile.get_compile_command( + sourcename=jit_compile_src, dllname=jit_compile_dll() + ) + KeOps_OS_Run(command) + KeOps_Message("OK", use_tag=False, flush=True) diff --git a/keopscore/keopscore/binders/nvrtc/keops_nvrtc_win.cpp b/keopscore/keopscore/binders/nvrtc/keops_nvrtc_win.cpp new file mode 100644 index 000000000..017201363 --- /dev/null +++ b/keopscore/keopscore/binders/nvrtc/keops_nvrtc_win.cpp @@ -0,0 +1,604 @@ + +// nvcc -shared -Xcompiler -fPIC -lnvrtc -lcuda keops_nvrtc.cu -o keops_nvrtc.so +// g++ --verbose -L/opt/cuda/lib64 -L/opt/cuda/targets/x86_64-linux/lib/ -I/opt/cuda/targets/x86_64-linux/include/ -I../../include -shared -fPIC -lcuda -lnvrtc -fpermissive -DMAXIDGPU=0 -DMAXTHREADSPERBLOCK0=1024 -DSHAREDMEMPERBLOCK0=49152 -DnvrtcGetTARGET=nvrtcGetCUBIN -DnvrtcGetTARGETSize=nvrtcGetCUBINSize -DARCHTAG=\"sm\" keops_nvrtc.cpp -o keops_nvrtc.so +// g++ -std=c++11 -shared -fPIC -O3 -fpermissive -L /usr/lib -L /opt/cuda/lib64 -lcuda -lnvrtc -DnvrtcGetTARGET=nvrtcGetCUBIN -DnvrtcGetTARGETSize=nvrtcGetCUBINSize -DARCHTAG=\"sm\" -I/home/bcharlier/projets/keops/keops/keops/include -I/opt/cuda/include -I/usr/include/python3.10/ -DMAXIDGPU=0 -DMAXTHREADSPERBLOCK0=1024 -DSHAREDMEMPERBLOCK0=49152 /home/bcharlier/projets/keops/keops/keops/binders/nvrtc/keops_nvrtc.cpp -o keops_nvrtc.cpython-310-x86_64-linux-gnu.so + +#include +#include +#include +#include +#include +#include +#include +#include +#include +//#include + +#define C_CONTIGUOUS 1 +#define USE_HALF 0 + +#include 
"include/Sizes_win.h" +#include "include/Ranges_win.h" +#include "include/utils_pe_win.h" +#include "include/ranges_utils_win.h" + + +#include "include/CudaSizes_win.h" +#include + + +int *build_offset_tables(int nbatchdims, int *shapes, int nblocks, int *lookup_h, + const std::vector< int > &indsi, + const std::vector< int > &indsj, + const std::vector< int > &indsp, + int tagJ) { + + int sizei = indsi.size(); + int sizej = indsj.size(); + int sizep = indsp.size(); + + // Support for broadcasting over batch dimensions ============================================= + + int sizevars = sizei + sizej + sizep; + + // Separate and store the shapes of the "i" and "j" variables + parameters -------------- + // + // shapes is an array of size (1+nargs)*(nbatchdims+3), which looks like: + // [ A, .., B, M, N, D_out] -> output + // [ A, .., B, M, 1, D_1 ] -> "i" variable + // [ A, .., B, 1, N, D_2 ] -> "j" variable + // [ A, .., B, 1, 1, D_3 ] -> "parameter" + // [ A, .., 1, M, 1, D_4 ] -> N.B.: we support broadcasting on the batch dimensions! + // [ 1, .., 1, M, 1, D_5 ] -> (we'll just ask users to fill in the shapes with *explicit* ones) + + //int shapes_i[sizei * (nbatchdims + 1)], shapes_j[sizej * (nbatchdims + 1)], shapes_p[sizep * (nbatchdims + 1)]; + std::vector shapes_i(sizei * (nbatchdims + 1)); + std::vector shapes_j(sizej * (nbatchdims + 1)); + std::vector shapes_p(sizep * (nbatchdims + 1)); + + // First, we fill shapes_i with the "relevant" shapes of the "i" variables, + // making it look like, say: + // [ A, .., B, M] + // [ A, .., 1, M] + // [ A, .., A, M] + // Then, we do the same for shapes_j, but with "N" instead of "M". + // And finally for the parameters, with "1" instead of "M". 
+ fill_shapes(nbatchdims, shapes, shapes_i.data(), shapes_j.data(), shapes_p.data(), tagJ, indsi, indsj, indsp); + + int tagIJ = tagJ; // 1 if the reduction is made "over j", 0 if it is made "over i" + int M = shapes[nbatchdims], N = shapes[nbatchdims + 1]; + + // We create a lookup table, "offsets", of shape (nblocks, SIZEVARS) -------- + int *offsets_d = NULL; + + //int offsets_h[nblocks * sizevars]; + std::vector offsets_h(nblocks * sizevars); + + for (int k = 0; k < nblocks; k++) { + int range_id = (int) lookup_h[3 * k]; + int start_x = tagIJ ? range_id * M : range_id * N; + int start_y = tagIJ ? range_id * N : range_id * M; + + int patch_offset = (int) (lookup_h[3 * k + 1] - start_x); + + vect_broadcast_index(start_x, nbatchdims, sizei, shapes, shapes_i.data(), offsets_h.data() + k * sizevars, patch_offset); + vect_broadcast_index(start_y, nbatchdims, sizej, shapes, shapes_j.data(), offsets_h.data() + k * sizevars + sizei); + vect_broadcast_index(range_id, nbatchdims, sizep, shapes, shapes_p.data(), offsets_h.data() + k * sizevars + sizei + sizej); + } + + CUDA_SAFE_CALL(cuMemAlloc((CUdeviceptr * ) & offsets_d, sizeof(int) * nblocks * sizevars)); + CUDA_SAFE_CALL(cuMemcpyHtoD((CUdeviceptr) offsets_d, offsets_h.data(), sizeof(int) * nblocks * sizevars)); + + return offsets_d; +} + + +void range_preprocess_from_device(int &nblocks, int tagI, int nranges_x, int nranges_y, int **castedranges, + int nbatchdims, int *&slices_x_d, int *&ranges_y_d, + int *&lookup_d, int *&offsets_d, int blockSize_x, + const std::vector< int > &indsi, + const std::vector< int > &indsj, + const std::vector< int > &indsp, + int *shapes) { + + // Ranges pre-processing... ================================================================== + + // N.B.: In the following code, we assume that the x-ranges do not overlap. + // Otherwise, we'd have to assume that DIMRED == DIMOUT + // or allocate a buffer of size nx * DIMRED. This may be done in the future. + // Cf. 
reduction.h: + // FUN::tagJ = 1 for a reduction over j, result indexed by i + // FUN::tagJ = 0 for a reduction over i, result indexed by j + + int tagJ = 1 - tagI; + int nranges = tagJ ? nranges_x : nranges_y; + + int *ranges_x = tagJ ? castedranges[0] : castedranges[3]; + int *slices_x = tagJ ? castedranges[1] : castedranges[4]; + int *ranges_y = tagJ ? castedranges[2] : castedranges[5]; + + std::vector ranges_x_h_arr(2 * nranges); + int* ranges_x_h; + + // The code below needs a pointer to ranges_x on *host* memory, ------------------- + // as well as pointers to slices_x and ranges_y on *device* memory. + // -> Depending on the "ranges" location, we'll copy ranges_x *or* slices_x and ranges_y + // to the appropriate memory: + bool ranges_on_device = (nbatchdims == 0); + // N.B.: We only support Host ranges with Device data when these ranges were created + // to emulate block-sparse reductions. + + if (ranges_on_device) { // The ranges are on the device + ranges_x_h = &ranges_x_h_arr[0]; + // Send data from device to host. 
+ cuMemcpyDtoH(ranges_x_h, (CUdeviceptr) ranges_x, sizeof(int) * 2 * nranges); + slices_x_d = slices_x; + ranges_y_d = ranges_y; + } else { // The ranges are on host memory; this is typically what happens with **batch processing**, + // with ranges generated by keops_io.h: + ranges_x_h = ranges_x; + // Copy "slices_x" to the device: + CUDA_SAFE_CALL(cuMemAlloc((CUdeviceptr * ) & slices_x_d, sizeof(int) * nranges)); + CUDA_SAFE_CALL(cuMemcpyHtoD((CUdeviceptr) slices_x_d, slices_x, sizeof(int) * nranges)); + + // Copy "redranges_y" to the device: with batch processing, we KNOW that they have the same shape as ranges_x + CUDA_SAFE_CALL(cuMemAlloc((CUdeviceptr * ) & ranges_y_d, sizeof(int) * 2 * nranges)); + CUDA_SAFE_CALL(cuMemcpyHtoD((CUdeviceptr) ranges_y_d, ranges_y, sizeof(int) * 2 * nranges)); + } + + // Computes the number of blocks needed --------------------------------------------- + nblocks = 0; + int len_range = 0; + for (int i = 0; i < nranges; i++) { + len_range = ranges_x_h[2 * i + 1] - ranges_x_h[2 * i]; + nblocks += (len_range / blockSize_x) + (len_range % blockSize_x == 0 ? 
0 : 1); + } + + // Create a lookup table for the blocks -------------------------------------------- + std::vector lookup_h(3 * nblocks); + int index = 0; + + for (int i = 0; i < nranges; i++) { + len_range = ranges_x_h[2 * i + 1] - ranges_x_h[2 * i]; + for (int j = 0; j < len_range; j += blockSize_x) { + lookup_h[3 * index] = i; + lookup_h[3 * index + 1] = ranges_x_h[2 * i] + j; + lookup_h[3 * index + 2] = ranges_x_h[2 * i] + j + std::min((int) blockSize_x, len_range - j); + index++; + } + } + + // Load the table on the device ----------------------------------------------------- + CUDA_SAFE_CALL(cuMemAlloc((CUdeviceptr * ) &lookup_d, sizeof(int) * 3 * nblocks)); + CUDA_SAFE_CALL(cuMemcpyHtoD((CUdeviceptr) lookup_d, lookup_h.data(), sizeof(int) * 3 * nblocks)); + + + // Support for broadcasting over batch dimensions ============================================= + + // We create a lookup table, "offsets", of shape (nblock, SIZEVARS): + + if (nbatchdims > 0) { + offsets_d = build_offset_tables(nbatchdims, shapes, nblocks, lookup_h.data(), + indsi, indsj, indsp, tagJ); + } + + +} + + +void +range_preprocess_from_host(int &nblocks, int tagI, int nranges_x, int nranges_y, int nredranges_x, int nredranges_y, + int **castedranges, + int nbatchdims, int *&slices_x_d, int *&ranges_y_d, + int *&lookup_d, int *&offsets_d, int blockSize_x, + const std::vector< int > &indsi, + const std::vector< int > &indsj, + const std::vector< int > &indsp, + int *shapes) { + + // Ranges pre-processing... ================================================================== + + // N.B.: In the following code, we assume that the x-ranges do not overlap. + // Otherwise, we'd have to assume that DIMRED == DIMOUT + // or allocate a buffer of size nx * DIMRED. This may be done in the future. + // Cf. reduction.h: + // FUN::tagJ = 1 for a reduction over j, result indexed by i + // FUN::tagJ = 0 for a reduction over i, result indexed by j + + int tagJ = 1 - tagI; + int nranges = tagJ ? 
nranges_x : nranges_y; + int nredranges = tagJ ? nredranges_y : nredranges_x; + + int *ranges_x = tagJ ? castedranges[0] : castedranges[3]; + int *slices_x = tagJ ? castedranges[1] : castedranges[4]; + int *ranges_y = tagJ ? castedranges[2] : castedranges[5]; + + // Computes the number of blocks needed --------------------------------------------- + nblocks = 0; + int len_range = 0; + for (int i = 0; i < nranges; i++) { + len_range = ranges_x[2 * i + 1] - ranges_x[2 * i]; + nblocks += (len_range / blockSize_x) + (len_range % blockSize_x == 0 ? 0 : 1); + } + + // Create a lookup table for the blocks -------------------------------------------- + std::vector lookup_h(3 * nblocks); + int index = 0; + + for (int i = 0; i < nranges; i++) { + len_range = ranges_x[2 * i + 1] - ranges_x[2 * i]; + for (int j = 0; j < len_range; j += blockSize_x) { + lookup_h[3 * index] = i; + lookup_h[3 * index + 1] = ranges_x[2 * i] + j; + lookup_h[3 * index + 2] = ranges_x[2 * i] + j + std::min((int) blockSize_x, len_range - j); + index++; + } + } + + // Load the table on the device ----------------------------------------------------- + CUDA_SAFE_CALL(cuMemAlloc((CUdeviceptr * ) & lookup_d, sizeof(int) * 3 * nblocks)); + CUDA_SAFE_CALL(cuMemcpyHtoD((CUdeviceptr) lookup_d, lookup_h.data(), sizeof(int) * 3 * nblocks)); + + // Send data from host to device: + CUDA_SAFE_CALL(cuMemAlloc((CUdeviceptr * ) & slices_x_d, sizeof(int) * 2 * nranges)); + CUDA_SAFE_CALL(cuMemcpyHtoD((CUdeviceptr) slices_x_d, slices_x, sizeof(int) * 2 * nranges)); + + CUDA_SAFE_CALL(cuMemAlloc((CUdeviceptr * ) & ranges_y_d, sizeof(int) * 2 * nredranges)); + CUDA_SAFE_CALL(cuMemcpyHtoD((CUdeviceptr) ranges_y_d, ranges_y, sizeof(int) * 2 * nredranges)); + + + // Support for broadcasting over batch dimensions ============================================= + + // We create a lookup table, "offsets", of shape (nblock, SIZEVARS): + + if (nbatchdims > 0) { + offsets_d = build_offset_tables(nbatchdims, shapes, nblocks, 
lookup_h.data(), + indsi, indsj, indsp, tagJ); + } + + +} + + +template< typename TYPE > +class KeOps_module { +public : + + CUdevice cuDevice; + CUcontext ctx; + CUmodule module; + char *target; + CUdeviceptr buffer; + int nargs; + + void SetContext() { + CUcontext current_ctx; + CUDA_SAFE_CALL_NO_EXCEPTION(cuCtxGetCurrent(¤t_ctx)); + if (current_ctx != ctx) + CUDA_SAFE_CALL_NO_EXCEPTION(cuCtxPushCurrent(ctx)); + CUDA_SAFE_CALL_NO_EXCEPTION(cuCtxGetCurrent(¤t_ctx)); + } + + + void Read_Target(const char *target_file_name) { + std::ifstream rf(target_file_name, std::ifstream::binary); + size_t targetSize; + rf.read((char *) &targetSize, sizeof(size_t)); + target = new char[targetSize]; + rf.read(target, targetSize); + rf.close(); + + } + + + KeOps_module(int device_id, int nargs_, const char *target_file_name) { + + nargs = nargs_; + + // init cuda in case not already done + CUDA_SAFE_CALL(cuInit(0)); + + // get the device and the primary context corresponding to device_id + CUDA_SAFE_CALL(cuDeviceGet(&cuDevice, device_id)); + CUDA_SAFE_CALL(cuDevicePrimaryCtxRetain(&ctx, cuDevice)); + + // set the primary context as the active current context + SetContext(); + + // set global variables giving some properties of device + SetGpuProps(device_id); + + // read the ptx or cubin file into a char array + Read_Target(target_file_name); + + // load the corresponding module + CUDA_SAFE_CALL(cuModuleLoadDataEx(&module, target, 0, NULL, NULL)); + + // allocate a small memory buffer for "on device" computation mode, + // This is just used for storing the list of pointers to device data + // as a device array ; it is better to allocate it here once for all, + // otherwise allocating it at each call may cause a small overhead. 
+ CUDA_SAFE_CALL(cuMemAlloc(&buffer, nargs * sizeof(TYPE *))); + + } + + + ~KeOps_module() { + SetContext(); + CUDA_SAFE_CALL_NO_EXCEPTION(cuMemFree(buffer)); + CUDA_SAFE_CALL_NO_EXCEPTION(cuModuleUnload(module)); + CUDA_SAFE_CALL_NO_EXCEPTION(cuDevicePrimaryCtxRelease(cuDevice)); + delete[] target; + } + + int launch_kernel(int tagHostDevice, int dimY, int nx, int ny, + int tagI, int tagZero, int use_half, + int tag1D2D, int dimred, + int cuda_block_size, int use_chunk_mode, + std::vector< int > indsi, std::vector< int > indsj, std::vector< int > indsp, + int dimout, + std::vector< int > dimsx, std::vector< int > dimsy, std::vector< int > dimsp, + int **ranges, + std::vector< int > shapeout, TYPE *out, + TYPE **arg, + std::vector > argshape + ) { + + + SetContext(); + + ////end_ = clock(); + ////std::cout << " time for set device : " << double(//end_ - start_) / CLOCKS_PER_SEC << std::endl; + //start_ = clock(); + + Sizes SS(nargs, arg, argshape, nx, ny, + tagI, use_half, + dimout, + indsi, indsj, indsp, + dimsx, dimsy, dimsp); + + //end_ = clock(); + //std::cout << " time for Sizes : " << double(//end_ - start_) / CLOCKS_PER_SEC << std::endl; + //start_ = clock(); + + if (use_half) + SS.switch_to_half2_indexing(); + + Ranges RR(SS, ranges); + nx = SS.nx; + ny = SS.ny; + + //end_ = clock(); + //std::cout << " time for Ranges : " << double(//end_ - start_) / CLOCKS_PER_SEC << std::endl; + //start_ = clock(); + + // now we switch (back...) indsi, indsj and dimsx, dimsy in case tagI=1. + // This is to be consistent with the convention used in the old + // bindings where i and j variables had different meanings in bindings + // and in the core code. 
Clearly we could do better if we + // carefully rewrite some parts of the code + if (tagI == 1) { + std::vector< int > tmp; + + tmp = indsj; + indsj = indsi; + indsi = tmp; + + tmp = dimsy; + dimsy = dimsx; + dimsx = tmp; + } + + + int blockSize_x = 1, blockSize_y = 1, blockSize_z = 1; + + if (use_chunk_mode == 0) { + // warning : blockSize.x was previously set to CUDA_BLOCK_SIZE; currently CUDA_BLOCK_SIZE value is used as a bound. + blockSize_x = std::min(cuda_block_size, + std::min(maxThreadsPerBlock, + (int) (sharedMemPerBlock / std::max(1, (int) (dimY * sizeof(TYPE)))) + ) + ); // number of threads in each block + } else { + // warning : the value here must match the one which is set in file GpuReduc1D_chunks.py, line 59 + // and file GpuReduc1D_finalchunks.py, line 67 + blockSize_x = std::min(cuda_block_size, + std::min(1024, (int) (49152 / std::max(1, (int) (dimY * sizeof(TYPE))))) + ); + } + + int nblocks; + + if (tagI == 1) { + int tmp = ny; + ny = nx; + nx = tmp; + } + + int *lookup_d = NULL, *slices_x_d = NULL, *ranges_y_d = NULL; + int *offsets_d = NULL; + + if (RR.tagRanges == 1) { + if (tagHostDevice == 1) { + range_preprocess_from_device(nblocks, tagI, RR.nranges_x, RR.nranges_y, RR.castedranges, + SS.nbatchdims, slices_x_d, ranges_y_d, lookup_d, + offsets_d, + blockSize_x, indsi, indsj, indsp, SS.shapes); + } else { // tagHostDevice==0 + range_preprocess_from_host(nblocks, tagI, RR.nranges_x, RR.nranges_y, RR.nredranges_x, RR.nredranges_y, + RR.castedranges, + SS.nbatchdims, slices_x_d, ranges_y_d, lookup_d, + offsets_d, + blockSize_x, indsi, indsj, indsp, SS.shapes); + } + } + + ////end_ = clock(); + ////std::cout << " time for interm : " << double(//end_ - start_) / CLOCKS_PER_SEC << std::endl; + //start_ = clock(); + + CUdeviceptr p_data; + TYPE *out_d; + TYPE **arg_d; + + int sizeout = std::accumulate(shapeout.begin(), shapeout.end(), 1, std::multiplies< int >()); + + if (tagHostDevice == 1) { + p_data = buffer; + load_args_FromDevice(p_data, 
out, out_d, nargs, arg, arg_d); + } else + load_args_FromHost(p_data, out, out_d, nargs, arg, arg_d, argshape, sizeout); + + ////end_ = clock(); + ////std::cout << " time for load_args : " << double(//end_ - start_) / CLOCKS_PER_SEC << std::endl; + //start_ = clock(); + + CUfunction kernel; + + int gridSize_x = 1, gridSize_y = 1, gridSize_z = 1; + + if (tag1D2D == 1) { // 2D scheme + + gridSize_x = nx / blockSize_x + (nx % blockSize_x == 0 ? 0 : 1); + gridSize_y = ny / blockSize_x + (ny % blockSize_x == 0 ? 0 : 1); + + // Reduce : grid and block are both 1d + int blockSize2_x = 1, blockSize2_y = 1, blockSize2_z = 1; + blockSize2_x = blockSize_x; // number of threads in each block + int gridSize2_x = 1, gridSize2_y = 1, gridSize2_z = 1; + gridSize2_x = (nx * dimred) / blockSize2_x + ((nx * dimred) % blockSize2_x == 0 ? 0 : 1); + + // Data on the device. We need an "inflated" outB, which contains gridSize.y "copies" of out + // that will be reduced in the final pass. + TYPE *outB; + + // single cudaMalloc + CUdeviceptr p_data_outB; + CUDA_SAFE_CALL(cuMemAlloc(&p_data_outB, sizeof(TYPE) * (nx * dimred * gridSize_y))); + + outB = (TYPE *) p_data_outB; // partial results live in the buffer allocated just above, not in the argument buffer p_data + + CUDA_SAFE_CALL(cuModuleGetFunction(&kernel, module, "GpuConv2DOnDevice")); + + void *kernel_params[4]; + kernel_params[0] = &nx; + kernel_params[1] = &ny; + kernel_params[2] = &outB; + kernel_params[3] = &arg_d; + + // Size of the SharedData : blockSize.x*(DIMY)*sizeof(TYPE) + + CUDA_SAFE_CALL(cuLaunchKernel(kernel, + gridSize_x, gridSize_y, gridSize_z, // grid dim + blockSize_x, blockSize_y, blockSize_z, // block dim + blockSize_x * dimY * sizeof(TYPE), NULL, // shared mem and stream + kernel_params, 0)); + // block until the device has completed + CUDA_SAFE_CALL(cuCtxSynchronize()); + + // Since we've used a 2D scheme, there's still a "blockwise" line reduction to make on + // the output array px_d[0] = x1B.
We go from shape ( gridSize.y * nx, DIMRED ) to (nx, DIMOUT) + CUfunction kernel_reduce; + CUDA_SAFE_CALL(cuModuleGetFunction(&kernel_reduce, module, "reduce2D")); + void *kernel_reduce_params[4]; + kernel_reduce_params[0] = &outB; + kernel_reduce_params[1] = &out_d; + kernel_reduce_params[2] = &gridSize_y; + kernel_reduce_params[3] = &nx; + + CUDA_SAFE_CALL(cuLaunchKernel(kernel_reduce, + gridSize2_x, gridSize2_y, gridSize2_z, // grid dim + blockSize2_x, blockSize2_y, blockSize2_z, // block dim + 0, NULL, // shared mem and stream + kernel_reduce_params, 0)); + CUDA_SAFE_CALL(cuCtxSynchronize()); // reduce2D must finish reading outB before it is released + CUDA_SAFE_CALL(cuMemFree(p_data_outB)); + } else if (RR.tagRanges == 1 && tagZero == 0) { + // ranges mode + + gridSize_x = nblocks; + + CUDA_SAFE_CALL(cuModuleGetFunction(&kernel, module, "GpuConv1DOnDevice_ranges")); + // std::cout << "GpuConv1DOnDevice_ranges " << nx << " " << gridSize_x ; + void *kernel_params[9]; + kernel_params[0] = &nx; + kernel_params[1] = &ny; + kernel_params[2] = &SS.nbatchdims; + kernel_params[3] = &offsets_d; + kernel_params[4] = &lookup_d; + kernel_params[5] = &slices_x_d; + kernel_params[6] = &ranges_y_d; + kernel_params[7] = &out_d; + kernel_params[8] = &arg_d; + + CUDA_SAFE_CALL(cuLaunchKernel(kernel, + gridSize_x, gridSize_y, gridSize_z, // grid dim + blockSize_x, blockSize_y, blockSize_z, // block dim + blockSize_x * dimY * sizeof(TYPE), NULL, // shared mem and stream + kernel_params, 0)); // arguments + + } else { + // simple mode + + gridSize_x = nx / blockSize_x + (nx % blockSize_x == 0 ?
0 : 1); + + CUDA_SAFE_CALL(cuModuleGetFunction(&kernel, module, "GpuConv1DOnDevice")); + + void *kernel_params[4]; + kernel_params[0] = &nx; + kernel_params[1] = &ny; + kernel_params[2] = &out_d; + kernel_params[3] = &arg_d; + + //std::cout << "GpuConv1DOnDevice " << nx << " " << gridSize_x ;//<< " " << gridSize_y << " " << gridSize_z << " " << blockSize_x << " " << blockSize_y << " " << blockSize_z << " " << blockSize_x * dimY * sizeof(TYPE) << std::endl; + + CUDA_SAFE_CALL(cuLaunchKernel(kernel, + gridSize_x, gridSize_y, gridSize_z, // grid dim + blockSize_x, blockSize_y, blockSize_z, // block dim + blockSize_x * dimY * sizeof(TYPE), NULL, // shared mem and stream + kernel_params, 0)); // arguments + } + + CUDA_SAFE_CALL(cuCtxSynchronize()); + + ////end_ = clock(); + ////std::cout << " time for kernel : " << double(//end_ - start_) / CLOCKS_PER_SEC << std::endl; + //start_ = clock(); + + // Send data from device to host. + + + if (tagHostDevice == 0) { + + CUDA_SAFE_CALL(cuMemcpyDtoH(out, (CUdeviceptr) out_d, sizeof(TYPE) * sizeout)); + CUDA_SAFE_CALL(cuMemFree(p_data)); + + } + + if (RR.tagRanges == 1) { + CUDA_SAFE_CALL(cuMemFree((CUdeviceptr) lookup_d)); + if (SS.nbatchdims > 0) { + CUDA_SAFE_CALL(cuMemFree((CUdeviceptr) slices_x_d)); + CUDA_SAFE_CALL(cuMemFree((CUdeviceptr) ranges_y_d)); + CUDA_SAFE_CALL(cuMemFree((CUdeviceptr) offsets_d)); + } + } + + //end_ = end = clock(); + ////std::cout << " time for last part : " << double(//end_ - start_) / CLOCKS_PER_SEC << std::endl; + ////std::cout << "time for launch_keops inner : " << double(end - start) / CLOCKS_PER_SEC << std::endl; + + return 0; + } + +}; + + +template +class KeOps_module< float >; + +template +class KeOps_module< double >; + +template +class KeOps_module< half2 >; diff --git a/keopscore/keopscore/binders/nvrtc/nvrtc_jit_win.cpp b/keopscore/keopscore/binders/nvrtc/nvrtc_jit_win.cpp new file mode 100644 index 000000000..39bb6c72c --- /dev/null +++ 
b/keopscore/keopscore/binders/nvrtc/nvrtc_jit_win.cpp @@ -0,0 +1,114 @@ +// nvcc -shared -Xcompiler -fPIC -lnvrtc -lcuda keops_nvrtc.cu -o keops_nvrtc.so +// g++ --verbose -L/opt/cuda/lib64 -L/opt/cuda/targets/x86_64-linux/lib/ -I/opt/cuda/targets/x86_64-linux/include/ -I../../include -shared -fPIC -lcuda -lnvrtc -fpermissive -DMAXIDGPU=0 -DMAXTHREADSPERBLOCK0=1024 -DSHAREDMEMPERBLOCK0=49152 -DnvrtcGetTARGET=nvrtcGetCUBIN -DnvrtcGetTARGETSize=nvrtcGetCUBINSize -DARCHTAG=\"sm\" keops_nvrtc.cpp -o keops_nvrtc.so +// g++ -std=c++11 -shared -fPIC -O3 -fpermissive -L /usr/lib -L /opt/cuda/lib64 -lcuda -lnvrtc -DnvrtcGetTARGET=nvrtcGetCUBIN -DnvrtcGetTARGETSize=nvrtcGetCUBINSize -DARCHTAG=\"sm\" -I/home/bcharlier/projets/keops/keops/keops/include -I/opt/cuda/include -I/usr/include/python3.10/ -DMAXIDGPU=0 -DMAXTHREADSPERBLOCK0=1024 -DSHAREDMEMPERBLOCK0=49152 /home/bcharlier/projets/keops/keops/keops/binders/nvrtc/keops_nvrtc.cpp -o keops_nvrtc.cpython-310-x86_64-linux-gnu.so + +#include +#include +#include +#include +#include +#include +#include +#include +#include +//#include + +#define C_CONTIGUOUS 1 +#define USE_HALF 0 + +#ifdef _WIN32 +#define DLL_EXPORT extern "C" __declspec(dllexport) +#else +#define DLL_EXPORT extern "C" +#endif + +#include "include/Sizes_win.h" +#include "include/Ranges_win.h" +#include "include/utils_pe_win.h" +#include "include/ranges_utils_win.h" + + +#include "include/CudaSizes_win.h" +#include + +DLL_EXPORT +int Compile(const char *target_file_name, const char *cu_code, int use_half, int device_id, + const char *cuda_include_path) { + + nvrtcProgram prog; + + int numHeaders; + const char *header_names[2]; + const char *header_sources[2]; + + std::ostringstream cuda_fp16_h_path, cuda_fp16_hpp_path; + cuda_fp16_h_path << cuda_include_path << "cuda_fp16.h" ; + cuda_fp16_hpp_path << cuda_include_path << "cuda_fp16.hpp" ; + + if (use_half) { + numHeaders = 2; + header_names[0] = "cuda_fp16.h"; + header_sources[0] = 
read_text_file(cuda_fp16_h_path.str().c_str()); + + header_names[1] = "cuda_fp16.hpp"; + header_sources[1] = read_text_file(cuda_fp16_hpp_path.str().c_str()); + + } else { + numHeaders = 0; + } + + // Get device id from Driver API + CUdevice cuDevice; + CUDA_SAFE_CALL(cuDeviceGet(&cuDevice, device_id)); + + // Get Compute Capability from Driver API + int deviceProp_major, deviceProp_minor; + CUDA_SAFE_CALL(cuDeviceGetAttribute(&deviceProp_major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice)); + CUDA_SAFE_CALL(cuDeviceGetAttribute(&deviceProp_minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice)); + + std::ostringstream arch_flag; + arch_flag << "-arch=" << ARCHTAG << "_" << deviceProp_major << deviceProp_minor; + + char *arch_flag_char = new char[arch_flag.str().length()]; + arch_flag_char = strdup(arch_flag.str().c_str()); + const char *opts[] = {arch_flag_char, "-use_fast_math"}; + + NVRTC_SAFE_CALL(nvrtcCreateProgram(&prog, // prog + cu_code, // buffer + NULL, // name + numHeaders, // numHeaders + header_sources, // headers + header_names // includeNames + )); + + nvrtcResult compileResult = nvrtcCompileProgram(prog, // prog + 2, // numOptions + opts); // options + + if (compileResult != NVRTC_SUCCESS) { + throw std::runtime_error("[KeOps] Error when compiling formula (error in nvrtcCompileProgram)."); + } + + delete[] arch_flag_char; + + // Obtain PTX or CUBIN from the program. + size_t targetSize; + NVRTC_SAFE_CALL(nvrtcGetTARGETSize(prog, &targetSize)); + + char *target = new char[targetSize]; + NVRTC_SAFE_CALL(nvrtcGetTARGET(prog, target)); + + // Destroy the program. 
+ NVRTC_SAFE_CALL(nvrtcDestroyProgram(&prog)); + + // write PTX code to file + + std::ofstream wf(target_file_name, std::ofstream::binary); + wf.write((char*)&targetSize, sizeof(size_t)); + wf.write(target, targetSize); + wf.close(); + + delete[] target; + + return 0; +} diff --git a/keopscore/keopscore/config/__init__.py b/keopscore/keopscore/config/__init__.py index fc31a0b9f..314189442 100644 --- a/keopscore/keopscore/config/__init__.py +++ b/keopscore/keopscore/config/__init__.py @@ -1,6 +1,10 @@ # Import the configuration classes from .base_config import Config -from .cuda import CUDAConfig +import os +if os.name != "nt": + from .cuda import CUDAConfig +else: + from .cuda_windows import CUDAConfigWin as CUDAConfig from .openmp import OpenMPConfig from .Platform import DetectPlatform diff --git a/keopscore/keopscore/config/cuda_windows.py b/keopscore/keopscore/config/cuda_windows.py new file mode 100644 index 000000000..5431f8b9c --- /dev/null +++ b/keopscore/keopscore/config/cuda_windows.py @@ -0,0 +1,207 @@ +import os +import ctypes +from ctypes.util import find_library +from ctypes import ( + c_int, + c_void_p, + c_char_p, + CDLL, + byref, + cast, + POINTER, + Structure, + RTLD_GLOBAL, +) +from pathlib import Path +import shutil +from os.path import join +import platform +import tempfile +import subprocess +import sys +import keopscore +from keopscore.utils.misc_utils import KeOps_Warning +from keopscore.utils.misc_utils import KeOps_OS_Run +from keopscore.utils.misc_utils import CHECK_MARK, CROSS_MARK + +from .cuda import CUDAConfig + +from ..windows_compilations import cuda_detection + + + +detection = cuda_detection.detect_cuda_toolkit() +# cuda_lib = detection['lib_dirs'] +# cuda_include = detection['include_dir'] +# cuda_dll = detection['dll_cuda'] +# cudart_dll = detection['dll_cudart'] +# cuda_nvrtc = detection['dll_nvrtc'] + + +class CUDAConfigWin(CUDAConfig): + """ + Class for CUDA detection on windows and configuration. 
+ """ + + # CUDA constants + CUDA_SUCCESS = 0 + CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1 + CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK = 8 + + def set_use_cuda(self): + """Determine and set whether to use CUDA.""" + self._use_cuda = cuda_detection.cuda_available + if not self._cuda_libraries_available(): + self._use_cuda = False + + self.get_cuda_version() + self.get_cuda_include_path() + self.get_gpu_props() + + if self.n_gpus == 0 and self._use_cuda: + self._use_cuda = False + self.cuda_message = "CUDA libraries detected, but no GPUs found on this system; Switching to CPU only." + KeOps_Warning(self.cuda_message) + + def _cuda_libraries_available(self): + """ + Check if both cuda and nvrtc libraries are available. + Returns: + True if both cuda and nvrtc are loadable, False otherwise. + This is also where we handle one single warning if needed. + """ + + return 'dll_nvrtc' in detection and 'dll_cuda' in detection + + + def get_cuda_version(self, out_type="single_value"): + + if not self._use_cuda: + self.cuda_version = None + return None + try: + + libcudart = ctypes.CDLL(detection['dll_cudart']) + cuda_version = ctypes.c_int() + libcudart.cudaRuntimeGetVersion(ctypes.byref(cuda_version)) + cuda_version_value = int(cuda_version.value) + + if out_type == "single_value": + self.cuda_version = cuda_version_value + return cuda_version_value + + major = cuda_version_value // 1000 + minor = (cuda_version_value % 1000) // 10 + + if out_type == "major,minor": + return major, minor + elif out_type == "string": + return f"{major}.{minor}" + except Exception as e: + KeOps_Warning(f"Could not determine CUDA version: {e}") + self.cuda_version = None + return None + + + def get_gpu_props(self): + """ + Getting GPU properties and related attributes. 
+ """ + if not self._use_cuda: + # Already determined that CUDA is unavailable + self.n_gpus = 0 + self.gpu_compile_flags = "" + return (self.n_gpus, self.gpu_compile_flags) + + # Attempt to load the CUDA driver library + libcuda_path = detection['dll_cuda'] + + # We have a handle, let's proceed + libcuda = ctypes.CDLL(libcuda_path) + result = libcuda.cuInit(0) + if result != self.CUDA_SUCCESS: + KeOps_Warning( + "CUDA was detected, but driver API could not be initialized. Switching to CPU only." + ) + self.n_gpus = 0 + self.gpu_compile_flags = "" + self._use_cuda = False + return (self.n_gpus, self.gpu_compile_flags) + + # Get GPU count + nGpus = ctypes.c_int() + result = libcuda.cuDeviceGetCount(ctypes.byref(nGpus)) + if result != self.CUDA_SUCCESS: + KeOps_Warning( + "CUDA was detected and driver API was initialized, but no working GPU found. " + "Switching to CPU only." + ) + self.n_gpus = 0 + self.gpu_compile_flags = "" + self._use_cuda = False + return (self.n_gpus, self.gpu_compile_flags) + + self.n_gpus = nGpus.value + if self.n_gpus == 0: + self.gpu_compile_flags = "" + return (self.n_gpus, self.gpu_compile_flags) + + # Query each GPU for properties + MaxThreadsPerBlock = [0] * self.n_gpus + SharedMemPerBlock = [0] * self.n_gpus + + def safe_call(dev_idx, result_code): + if result_code != self.CUDA_SUCCESS: + KeOps_Warning( + f"Error detecting properties for GPU device {dev_idx}. " + "Switching to CPU only." 
+ ) + return False + return True + + for d in range(self.n_gpus): + device = ctypes.c_int() + if not safe_call(d, libcuda.cuDeviceGet(ctypes.byref(device), d)): + self.n_gpus = 0 + self.gpu_compile_flags = "" + self._use_cuda = False + return (self.n_gpus, self.gpu_compile_flags) + + output = ctypes.c_int() + if not safe_call( + d, + libcuda.cuDeviceGetAttribute( + byref(output), + self.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, + device, + ), + ): + self.n_gpus = 0 + self.gpu_compile_flags = "" + self._use_cuda = False + return (self.n_gpus, self.gpu_compile_flags) + MaxThreadsPerBlock[d] = output.value + + if not safe_call( + d, + libcuda.cuDeviceGetAttribute( + byref(output), + self.CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, + device, + ), + ): + self.n_gpus = 0 + self.gpu_compile_flags = "" + self._use_cuda = False + return (self.n_gpus, self.gpu_compile_flags) + SharedMemPerBlock[d] = output.value + + # Build compile flags string + self.gpu_compile_flags = f"-DMAXIDGPU={self.n_gpus - 1} " + for d in range(self.n_gpus): + self.gpu_compile_flags += ( + f"-DMAXTHREADSPERBLOCK{d}={MaxThreadsPerBlock[d]} " + ) + self.gpu_compile_flags += f"-DSHAREDMEMPERBLOCK{d}={SharedMemPerBlock[d]} " + + return self.n_gpus, self.gpu_compile_flags diff --git a/keopscore/keopscore/include/CudaSizes_win.h b/keopscore/keopscore/include/CudaSizes_win.h new file mode 100644 index 000000000..09862a030 --- /dev/null +++ b/keopscore/keopscore/include/CudaSizes_win.h @@ -0,0 +1,96 @@ +#pragma once + +///////////////////////////////////////////// +// GPU Options // +///////////////////////////////////////////// + + + +// fix some Gpu properties +// CUDA_BLOCK_SIZE gives an upper bound on size of the size of Cuda blocks +// The actual block size may be lower due to memory limitations, depending on the formula used +#ifndef CUDA_BLOCK_SIZE +#define CUDA_BLOCK_SIZE 192 +#endif +// Here we define the maximum number of threads per block and the shared memory per block +// These values can 
depend on the Gpu, although in fact values 1024 and 49152 respectively +// are the good values for almost all cards. +// So these values should be fine, but you can check them with GetGpuProps.cu program +// Here we assume that: either the user has defined MAXIDGPU (=number of Gpu devices minus one) +// and corresponding specific values MAXTHREADSPERBLOCK0, SHAREDMEMPERBLOCK0, MAXTHREADSPERBLOCK1, SHAREDMEMPERBLOCK1, ... +// for each device, or MAXIDGPU is not defined, and we will use global MAXTHREADSPERBLOCK and SHAREDMEMPERBLOCK +#ifndef MAXIDGPU +// we give default values +#ifndef MAXTHREADSPERBLOCK +#define MAXTHREADSPERBLOCK 1024 +#endif +#ifndef SHAREDMEMPERBLOCK +#define SHAREDMEMPERBLOCK 49152 +#endif +#endif + +// global variables maxThreadsPerBlock and sharedMemPerBlock may depend on the device, so we will set them at each call using +// predefined MAXTHREADSPERBLOCK0, SHAREDMEMPERBLOCK0, MAXTHREADSPERBLOCK1, SHAREDMEMPERBLOCK1, etc. +// through the function SetGpuProps +int maxThreadsPerBlock, sharedMemPerBlock; + +#define SET_GPU_PROPS_MACRO(n) \ + if(device == n) { \ + maxThreadsPerBlock = MAXTHREADSPERBLOCK ## n; \ + sharedMemPerBlock = SHAREDMEMPERBLOCK ## n; \ + return; \ + } + +// I have not managed to use a "recursive macro" hack, it was not compiling on all systems. +// This assumes the number of Gpus is <= 12 (device ids 0 to 11) ; feel free to add more lines if needed !
+void SetGpuProps(int device) { + +#if defined(MAXTHREADSPERBLOCK) && defined(SHAREDMEMPERBLOCK) + // global values are defined + maxThreadsPerBlock = MAXTHREADSPERBLOCK; + sharedMemPerBlock = SHAREDMEMPERBLOCK; + return; +#else +#if MAXIDGPU >= 0 + SET_GPU_PROPS_MACRO(0) +#endif +#if MAXIDGPU >= 1 + SET_GPU_PROPS_MACRO(1) +#endif +#if MAXIDGPU >= 2 + SET_GPU_PROPS_MACRO(2) +#endif +#if MAXIDGPU >= 3 + SET_GPU_PROPS_MACRO(3) +#endif +#if MAXIDGPU >= 4 + SET_GPU_PROPS_MACRO(4) +#endif +#if MAXIDGPU >= 5 + SET_GPU_PROPS_MACRO(5) +#endif +#if MAXIDGPU >= 6 + SET_GPU_PROPS_MACRO(6) +#endif +#if MAXIDGPU >= 7 + SET_GPU_PROPS_MACRO(7) +#endif +#if MAXIDGPU >= 8 + SET_GPU_PROPS_MACRO(8) +#endif +#if MAXIDGPU >= 9 + SET_GPU_PROPS_MACRO(9) +#endif +#if MAXIDGPU >= 10 + SET_GPU_PROPS_MACRO(10) +#endif +#if MAXIDGPU >= 11 + SET_GPU_PROPS_MACRO(11) +#endif + fprintf( stderr, "invalid Gpu device number. If the number of available Gpus is > 12, add required lines at the end of function SetGpuProps and recompile.\n"); + throw std::runtime_error("[KeOps] Cuda error."); +#endif + +} + + diff --git a/keopscore/keopscore/include/Ranges_win.h b/keopscore/keopscore/include/Ranges_win.h new file mode 100644 index 000000000..19270499e --- /dev/null +++ b/keopscore/keopscore/include/Ranges_win.h @@ -0,0 +1,99 @@ +#pragma once + +#include "Sizes_win.h" + +template< typename TYPE > +class Ranges { +public: + int tagRanges, nranges_x, nranges_y, nredranges_x, nredranges_y; + + std::vector< int > ranges_i, slices_i, redranges_j; + std::vector< int * > _castedranges; + int **castedranges; + + Ranges(Sizes< TYPE > sizes, int **ranges) { + + _castedranges.resize(6); + + // Sparsity: should we handle ranges? 
====================================== + if (sizes.nbatchdims == 0) { // Standard M-by-N computation + if (ranges[6][0] == -1) { + tagRanges = 0; + + nranges_x = 0; + nranges_y = 0; + + nredranges_x = 0; + nredranges_y = 0; + + } else { + tagRanges = 1; + nranges_x = ranges[6][0]; + nranges_y = ranges[6][3]; + nredranges_x = ranges[6][5]; + nredranges_y = ranges[6][2]; + + // get the pointers to data to avoid a copy + for (int i = 0; i < 6; i++) { + _castedranges[i] = ranges[i]; + } + } + + } else if (ranges[6][0] == -1) { + // Batch processing: we'll have to generate a custom, block-diagonal sparsity pattern + tagRanges = 1; // Batch processing is emulated through the block-sparse mode + + // Create new "castedranges" from scratch ------------------------------ + // With pythonic notations, we'll have: + // castedranges = (ranges_i, slices_i, redranges_j, ranges_j, slices_j, redranges_i) + // with: + // - ranges_i = redranges_i = [ [0,M], [M,2M], ..., [(nbatches-1)M, nbatches*M] ] + // - slices_i = slices_j = [ 1, 2, ..., nbatches-1, nbatches ] + // - redranges_j = ranges_j = [ [0,N], [N,2N], ..., [(nbatches-1)N, nbatches*N] ] + + + //int ranges_i[2 * sizes.nbatches]; // ranges_i + ranges_i.resize(2 * sizes.nbatches, 0); + + //int slices_i[sizes.nbatches]; // slices_i + slices_i.resize(sizes.nbatches, 0); + + //int redranges_j[2 * sizes.nbatches]; // redranges_j + redranges_j.resize(2 * sizes.nbatches, 0); + + for (int b = 0; b < sizes.nbatches; b++) { + ranges_i[2 * b] = b * sizes.M; + ranges_i[2 * b + 1] = (b + 1) * sizes.M; + slices_i[b] = (b + 1); + redranges_j[2 * b] = b * sizes.N; + redranges_j[2 * b + 1] = (b + 1) * sizes.N; + } + + _castedranges[0] = &ranges_i[0]; + _castedranges[1] = &slices_i[0]; + _castedranges[2] = &redranges_j[0]; + _castedranges[3] = &redranges_j[0]; // ranges_j + _castedranges[4] = &slices_i[0]; // slices_j + _castedranges[5] = &ranges_i[0]; // redranges_i + + nranges_x = sizes.nbatches; + nredranges_x = sizes.nbatches; + nranges_y 
= sizes.nbatches; + nredranges_y = sizes.nbatches; + } + + castedranges = &_castedranges[0]; + +#if do_checks + else { + throw std::runtime_error( + "[KeOps] The 'ranges' argument (block-sparse mode) is not supported with batch processing, " + "but we detected " + std::to_string(sizes.nbatchdims) + " > 0 batch dimensions." + ); + } +#endif + + + }; + +}; diff --git a/keopscore/keopscore/include/Sizes_win.h b/keopscore/keopscore/include/Sizes_win.h new file mode 100644 index 000000000..c78f30e7c --- /dev/null +++ b/keopscore/keopscore/include/Sizes_win.h @@ -0,0 +1,402 @@ +#pragma once + +#include +#include +#include +#include + + +#define MIN(a, b) (((a)<(b))?(a):(b)) +#define MAX(a, b) (((a)<(b))?(b):(a)) +#define MAX3(a, b, c) (MAX(MAX(a,b),c)) + +#define do_checks 0 +#if do_checks +void error(std::string message) { + throw std::runtime_error(message); +} +#endif + + +#if C_CONTIGUOUS + +int get_val_batch(std::vector< int > _shape, int nbatch, int b) { + return _shape[b]; +} + +#else + +int get_val_batch(std::vector< int > _shape, int nbatch, int b) { + return _shape[nbatch - b]; +} + +#endif + +template< typename TYPE > +class Sizes { +public: + + // attributs + int nargs; + int nx, ny; + int M, N; + int nbatchdims; + int nbatches; + + std::vector< int > _shapes; + int *shapes; + std::vector< int > _shape_out; + int *shape_out; + + int tagIJ; + int use_half; + std::vector< int > indsI; + std::vector< int > indsJ; + std::vector< int > indsP; + int pos_first_argI; + int pos_first_argJ; + int dimout; + int nminargs; + int nvarsI; + int nvarsJ; + int nvarsP; + std::vector< int > dimsX; + std::vector< int > dimsY; + std::vector< int > dimsP; + + // constructors + Sizes(int _nargs, TYPE **args, const std::vector > &argshapes, int _nx, int _ny, + int tagIJ_, int use_half_, int dimout_, + const std::vector< int > &indsI_, std::vector< int > indsJ_, const std::vector< int > &indsP_, + const std::vector< int > &dimsX_, std::vector< int > dimsY_, const std::vector< int > 
&dimsP_) { + + tagIJ = tagIJ_; + use_half = use_half_; + indsI = indsI_; + indsJ = indsJ_; + indsP = indsP_; + dimout = dimout_; + + nvarsI = indsI.size(); + nvarsJ = indsJ.size(); + nvarsP = indsP.size(); + + pos_first_argI = (nvarsI > 0) ? *std::min_element(indsI.begin(), indsI.end()) : -1; + pos_first_argJ = (nvarsJ > 0) ? *std::min_element(indsJ.begin(), indsJ.end()) : -1; + + int max_i = (nvarsI > 0) ? *std::max_element(indsI.begin(), indsI.end()) : -1; + int max_j = (nvarsJ > 0) ? *std::max_element(indsJ.begin(), indsJ.end()) : -1; + int max_p = (nvarsP > 0) ? *std::max_element(indsP.begin(), indsP.end()) : -1; + + nminargs = 1 + MAX3(max_i, max_j, max_p); + dimsX = dimsX_; + dimsY = dimsY_; + dimsP = dimsP_; + nargs = _nargs; + nx = _nx; + ny = _ny; + + // fill shapes wit "batch dimensions" [A, .., B], the table will look like: + // + // [ A, .., B, M, N, D_out] -> output + // [ A, .., B, M, 1, D_1 ] -> "i" variable + // [ A, .., B, 1, N, D_2 ] -> "j" variable + // [ A, .., B, 1, 1, D_3 ] -> "parameter" + // [ A, .., 1, M, 1, D_4 ] -> N.B.: we support broadcasting on the batch dimensions! 
+ // [ 1, .., 1, M, 1, D_5 ] -> (we'll just ask users to fill in the shapes with *explicit* ones) + fill_shape(nargs, argshapes); + + check_ranges(argshapes); + + // fill shape_out + _shape_out.resize(nbatchdims + 3); + +#if C_CONTIGUOUS + std::copy(_shapes.begin(), _shapes.begin() + nbatchdims + 3, _shape_out.begin());// Copy the "batch dimensions" + _shape_out.erase(_shape_out.begin() + nbatchdims + (1 - tagIJ)); + +#else + std::reverse_copy(_shapes.begin(), _shapes.begin() + nbatchdims + 3, + _shape_out.begin());// Copy the "batch dimensions" + _shape_out.erase(_shape_out.begin() + 1 + tagIJ); + +#endif + + // fill nx and ny + M = _shapes[nbatchdims]; // = M + N = _shapes[nbatchdims + 1]; // = N + + // Compute the product of all "batch dimensions" + nbatches = std::accumulate(_shapes.begin(), _shapes.begin() + nbatchdims, 1, std::multiplies< int >()); + + nx = nbatches * M; // = A * ... * B * M + ny = nbatches * N; // = A * ... * B * N + + shapes = &_shapes[0]; + shape_out = &_shape_out[0]; + } + + + // methods + + void switch_to_half2_indexing(); + +private: + void fill_shape(const int nargs, const std::vector > &argshapes); + + void check_ranges(const std::vector > &argshapes); + + int MN_pos, D_pos; +}; + + +template< typename TYPE > +void Sizes< TYPE >::fill_shape(const int nargs, const std::vector > &argshapes) { + + int pos = std::max(pos_first_argI, pos_first_argJ); + + if (pos > -1) { + // Are we working in batch mode? Infer the answer from the first arg ============= + nbatchdims = argshapes[pos].size() - 2; // number of batch dimensions = Number of dims of the first tensor - 2 + + if (nbatchdims < 0) { +#if do_checks + error("[KeOps] Wrong number of dimensions for arg at position 0: is " + + std::to_string(argshapes[0].size()) + " but should be at least 2." 
+ ); +#endif + } + } else { + nbatchdims = 0; + } + +#if C_CONTIGUOUS + MN_pos = nbatchdims; + D_pos = nbatchdims + 1; +#else + D_pos = 0; + MN_pos = 1; +#endif + + // Now, we'll keep track of the output + all arguments' shapes in a large array: + _shapes.resize((nargs + 1) * (nbatchdims + 3), 1); + + if (use_half) { + if (tagIJ == 0) { + _shapes[nbatchdims] = nx % 2 ? nx + 1 : nx; + _shapes[nbatchdims + 1] = 2 * ny; + } else { + _shapes[nbatchdims] = 2 * nx; + _shapes[nbatchdims + 1] = ny % 2 ? ny + 1 : ny; + } + } else { + _shapes[nbatchdims] = nx; + _shapes[nbatchdims + 1] = ny; + } + + _shapes[nbatchdims + 2] = dimout; // Top right corner: dimension of the output + +} + +template< typename TYPE > +void Sizes< TYPE >::check_ranges(const std::vector > &argshapes) { + + // Check the compatibility of all tensor shapes ================================== + if (nminargs > 0) { + + // Checks args in all the positions that correspond to "i" variables: + for (int k = 0; k < nvarsI; k++) { + int i = indsI[k]; + + // Fill in the (i+1)-th line of the "shapes" array --------------------------- + int off_i = (i + 1) * (nbatchdims + 3); + + // Check the number of dimensions -------------------------------------------- + int ndims = argshapes[i].size(); // Number of dims of the i-th tensor + +#if do_checks + if (ndims != nbatchdims + 2) { + error("[KeOps] Wrong number of dimensions for arg at position " + std::to_string(i) + + " (i type): KeOps detected " + std::to_string(nbatchdims) + + " batch dimensions from the first argument 0, and thus expected " + + std::to_string(nbatchdims + 2) + + " dimensions here, but only received " + + std::to_string(ndims) + + ". 
Note that KeOps supports broadcasting on batch dimensions, " + + "but still expects 'dummy' unit dimensions in the input shapes, " + + "for the sake of clarity."); + } +#endif + + + + // First, the batch dimensions: + for (int b = 0; b < nbatchdims; b++) { + _shapes[off_i + b] = get_val_batch(argshapes[i], nbatchdims + 2, b); + + // Check that the current value is compatible with what + // we've encountered so far, as stored in the first line of "shapes" + if (_shapes[off_i + b] != 1) { // This dimension is not "broadcasted" + if (_shapes[b] == 1) { + _shapes[b] = _shapes[off_i + b]; // -> it becomes the new standard + } +#if do_checks + else if (_shapes[b] != _shapes[off_i + b]) { + error("[KeOps] Wrong value of the batch dimension " + + std::to_string(b) + " for argument number " + std::to_string(i) + + " : is " + std::to_string(_shapes[off_i + b]) + + " but was " + std::to_string(_shapes[b]) + + " or 1 in previous arguments."); + } +#endif + } + } + + _shapes[off_i + nbatchdims] = argshapes[i][MN_pos]; // = "M" + _shapes[off_i + nbatchdims + 2] = argshapes[i][D_pos]; // = "D" + + +#if do_checks + // Check the number of "lines": + if (_shapes[nbatchdims] != _shapes[off_i + nbatchdims]) { + error("[KeOps] Wrong value of the 'i' dimension " + + std::to_string(nbatchdims) + "for arg at position " + std::to_string(i) + + " : is " + std::to_string(_shapes[off_i + nbatchdims]) + + " but was " + std::to_string(_shapes[nbatchdims]) + + " in previous 'i' arguments."); + } + + // And the number of "columns": + if (_shapes[off_i + nbatchdims + 2] != static_cast< int >(dimsX[k])) { + error("[KeOps] Wrong value of the 'vector size' dimension " + + std::to_string(nbatchdims + 1) + " for arg at position " + std::to_string(i) + + " : is " + std::to_string(_shapes[off_i + nbatchdims + 2]) + + " but should be " + std::to_string(dimsX[k])); + } +#endif + } + + + // Checks args in all the positions that correspond to "j" variables: + for (int k = 0; k < nvarsJ; k++) { + int i = 
indsJ[k]; + + // Check the number of dimensions -------------------------------------------- + int ndims = argshapes[i].size(); // Number of dims of the i-th tensor + +#if do_checks + if (ndims != nbatchdims + 2) { + error("[KeOps] Wrong number of dimensions for arg at position " + std::to_string(i) + + " (j type): KeOps detected " + std::to_string(nbatchdims) + + " batch dimensions from the first argument 0, and thus expected " + + std::to_string(nbatchdims + 2) + + " dimensions here, but only received " + + std::to_string(ndims) + + ". Note that KeOps supports broadcasting on batch dimensions, " + + "but still expects 'dummy' unit dimensions in the input shapes, " + + "for the sake of clarity."); + } +#endif + + // Fill in the (i+1)-th line of the "shapes" array --------------------------- + int off_i = (i + 1) * (nbatchdims + 3); + + // First, the batch dimensions: + for (int b = 0; b < nbatchdims; b++) { + _shapes[off_i + b] = get_val_batch(argshapes[i], nbatchdims + 2, b); + + // Check that the current value is compatible with what + // we've encountered so far, as stored in the first line of "shapes" + if (_shapes[off_i + b] != 1) { // This dimension is not "broadcasted" + if (_shapes[b] == 1) { + _shapes[b] = _shapes[off_i + b]; // -> it becomes the new standard + } +#if do_checks + else if (_shapes[b] != _shapes[off_i + b]) { + error("[KeOps] Wrong value of the batch dimension " + + std::to_string(b) + " for argument number " + std::to_string(i) + + " : is " + std::to_string(_shapes[off_i + b]) + + " but was " + std::to_string(_shapes[b]) + + " or 1 in previous arguments."); + } +#endif + } + } + + _shapes[off_i + nbatchdims + 1] = argshapes[i][MN_pos]; // = "N" + _shapes[off_i + nbatchdims + 2] = argshapes[i][D_pos]; // = "D" + + +#if do_checks + // Check the number of "lines": + if (_shapes[nbatchdims + 1] != _shapes[off_i + nbatchdims + 1]) { + error("[KeOps] Wrong value of the 'j' dimension " + + std::to_string(nbatchdims) + " for arg at position " + 
std::to_string(i) + + " : is " + std::to_string(_shapes[off_i + nbatchdims + 1]) + + " but was " + std::to_string(_shapes[nbatchdims + 1]) + + " in previous 'j' arguments."); + } + + // And the number of "columns": + if (_shapes[off_i + nbatchdims + 2] != static_cast< int >(dimsY[k])) { + error("[KeOps] Wrong value of the 'vector size' dimension " + + std::to_string(nbatchdims + 1) + " for arg at position " + std::to_string(i) + + " : is " + std::to_string(_shapes[off_i + nbatchdims + 2]) + + " but should be " + std::to_string(dimsY[k])); + } +#endif + } + + + for (int k = 0; k < nvarsP; k++) { + int i = indsP[k]; + // Fill in the (i+1)-th line of the "shapes" array --------------------------- + int off_i = (i + 1) * (nbatchdims + 3); + // First, the batch dimensions: + for (int b = 0; b < nbatchdims; b++) { + _shapes[off_i + b] = get_val_batch(argshapes[i], nbatchdims + 2, b); + } + _shapes[off_i + nbatchdims + 2] = argshapes[i][nbatchdims]; // = "D" +#if do_checks + int dim_param; + if (use_half) + dim_param = _shapes[off_i + nbatchdims + 2] / 2; + else + dim_param = _shapes[off_i + nbatchdims + 2]; + if (dim_param != static_cast< int >(dimsP[k])) { + error("[KeOps] Wrong value of the 'vector size' dimension " + + std::to_string(nbatchdims) + " for arg at position " + std::to_string(i) + + " : is " + std::to_string(dim_param) + + " but should be " + std::to_string(dimsP[k])); + } +#endif + } + } + +} + +template< typename TYPE > +void Sizes< TYPE >::switch_to_half2_indexing() { + // special case of float16 inputs : because we use half2 type in Cuda codes, we need to divide by two nx, ny, and M, N, or D + // values inside the shapes vector. + nx = nx / 2; + ny = ny / 2; + M = M / 2; + N = N / 2; + _shapes[nbatchdims] = _shapes[nbatchdims] / 2; + _shapes[nbatchdims + 1] = _shapes[nbatchdims + 1] / 2; + for (int i = 0; i < nargs; i++) { + int off_i = (i + 1) * (nbatchdims + 3); + // we don't have anymore the category information... 
+ // the last three dimensions are either of the form (M,1,D), (1,N,D), or (1,1,D) + // where M or N are even in the 2 first cases, or D is even in the third case. + if (_shapes[off_i + nbatchdims] > 1) + _shapes[off_i + nbatchdims] = _shapes[off_i + nbatchdims] / 2; + else if (_shapes[off_i + nbatchdims + 1] > 1) + _shapes[off_i + nbatchdims + 1] = _shapes[off_i + nbatchdims + 1] / 2; + else + _shapes[off_i + nbatchdims + 2] = _shapes[off_i + nbatchdims + 2] / 2; + } +} + diff --git a/keopscore/keopscore/include/ranges_utils_win.h b/keopscore/keopscore/include/ranges_utils_win.h new file mode 100644 index 000000000..372b61c06 --- /dev/null +++ b/keopscore/keopscore/include/ranges_utils_win.h @@ -0,0 +1,82 @@ +#pragma once + + +int broadcast_index(int i, int nbatchdims, int *full_shape, int *shape) { + int M_N = shape[nbatchdims]; + int res = i % M_N, step = M_N, full_step = M_N; + for (int b = nbatchdims; b > 0; b--) { + if (shape[b - 1] != 1) { + res += ((i / full_step) % shape[b - 1]) * step; + } + full_step *= full_shape[b - 1]; + step *= shape[b - 1]; + } + return res; +} + +void vect_broadcast_index(int i, int nbatchdims, int nvars, int *full_shape, + int *reduced_shapes, int *out, int add_offset = 0) { + for (int k = 0; k < nvars; k++) { + out[k] = add_offset + broadcast_index(i, nbatchdims, full_shape, reduced_shapes + (nbatchdims + 1) * k); + } +} + +void fill_shapes(int nbatchdims, int *shapes, int *shapes_i, int *shapes_j, int *shapes_p, + int tagJ, + const std::vector< int > &indsi, + const std::vector< int > &indsj, + const std::vector< int > &indsp) { + + int sizei = indsi.size(); + int sizej = indsj.size(); + int sizep = indsp.size(); + + sizei += 1; + + const int tagIJ = tagJ; // 1 if the reduction is made "over j", 0 if it is made "over i" + + // Separate and store the shapes of the "i" and "j" variables + parameters -------------- + // + // N.B.: If tagIJ == 1, the reduction is made over 'j', which is the default mode. 
+ // However, if tagIJ == 0, the reduction is performed over the 'i' variables: + // since "shape" does not change, we must adapt the adress at which we pick information... + // + // shapes is an array of size (1+nargs)*(nbatchdims+3), which looks like: + // [ A, .., B, M, N, D_out] -> output + // [ A, .., B, M, 1, D_1 ] -> "i" variable + // [ A, .., B, 1, N, D_2 ] -> "j" variable + // [ A, .., B, 1, 1, D_3 ] -> "parameter" + // [ A, .., 1, M, 1, D_4 ] -> N.B.: we support broadcasting on the batch dimensions! + // [ 1, .., 1, M, 1, D_5 ] -> (we'll just ask users to fill in the shapes with *explicit* ones) + + // First, we fill shapes_i with the "relevant" shapes of the "i" variables, + // making it look like, say: + // [ A, .., B, M] + // [ A, .., 1, M] + // [ A, .., A, M] + for (int k = 0; k < (sizei - 1); k++) { // k-th line + for (int l = 0; l < nbatchdims; l++) { // l-th column + shapes_i[k * (nbatchdims + 1) + l] = shapes[(1 + indsi[k]) * (nbatchdims + 3) + l]; + } + shapes_i[k * (nbatchdims + 1) + nbatchdims] = + shapes[(1 + indsi[k]) * (nbatchdims + 3) + nbatchdims + 1 - tagIJ]; + } + + // Then, we do the same for shapes_j, but with "N" instead of "M": + for (int k = 0; k < sizej; k++) { // k-th line + for (int l = 0; l < nbatchdims; l++) { // l-th column + shapes_j[k * (nbatchdims + 1) + l] = shapes[(1 + indsj[k]) * (nbatchdims + 3) + l]; + } + shapes_j[k * (nbatchdims + 1) + nbatchdims] = shapes[(1 + indsj[k]) * (nbatchdims + 3) + nbatchdims + + tagIJ]; + } + + // And finally for the parameters, with "1" instead of "M": + for (int k = 0; k < sizep; k++) { // k-th line + for (int l = 0; l < nbatchdims; l++) { // l-th column + shapes_p[k * (nbatchdims + 1) + l] = shapes[(1 + indsp[k]) * (nbatchdims + 3) + l]; + } + shapes_p[k * (nbatchdims + 1) + nbatchdims] = 1; + } + +} \ No newline at end of file diff --git a/keopscore/keopscore/include/utils_pe_win.h b/keopscore/keopscore/include/utils_pe_win.h new file mode 100644 index 000000000..6e172e476 --- 
/dev/null +++ b/keopscore/keopscore/include/utils_pe_win.h @@ -0,0 +1,100 @@ +#include +#include + +#define NVRTC_SAFE_CALL(x) \ + do { \ + nvrtcResult result = x; \ + if (result != NVRTC_SUCCESS) { \ + std::cerr << "\nerror: " #x " failed with error " \ + << nvrtcGetErrorString(result) << '\n' << '\n'; \ + throw std::runtime_error("[KeOps] NVRTC error."); \ + } \ + } while(0) + +#define CUDA_SAFE_CALL_NO_EXCEPTION(x) \ + do { \ + CUresult result = x; \ + if (result != CUDA_SUCCESS) { \ + const char *msg; \ + cuGetErrorName(result, &msg); \ + std::cerr << "\n[KeOps] error: " #x " failed with error " \ + << msg << '\n' << '\n'; \ + exit(1); \ + } \ + } while(0) + +#define CUDA_SAFE_CALL(x) \ + do { \ + CUresult result = x; \ + if (result != CUDA_SUCCESS) { \ + const char *msg; \ + cuGetErrorName(result, &msg); \ + std::cerr << "\n[KeOps] error: " #x " failed with error " \ + << msg << '\n' << '\n'; \ + throw std::runtime_error("[KeOps] Cuda error."); \ + } \ + } while(0) + + +char *read_text_file(char const *path) { + char *buffer = 0; + long length; + FILE *f = fopen(path, "rb"); + if (f) { + fseek(f, 0, SEEK_END); + length = ftell(f); + fseek(f, 0, SEEK_SET); + buffer = (char *) malloc((length + 1) * sizeof(char)); + if (buffer) { + int res = fread(buffer, sizeof(char), length, f); + } + fclose(f); + } + buffer[length] = '\0'; + return buffer; +} + + + +template +void load_args_FromDevice(CUdeviceptr &p_data, TYPE *out, TYPE *&out_d, int nargs, TYPE **arg, TYPE **&arg_d) { + CUDA_SAFE_CALL(cuMemAlloc(&p_data, sizeof(TYPE *) * nargs)); + out_d = out; + arg_d = (TYPE **) p_data; + // copy array of pointers + CUDA_SAFE_CALL(cuMemcpyHtoD((CUdeviceptr) arg_d, arg, nargs * sizeof(TYPE *))); +} + + +template +void +load_args_FromHost(CUdeviceptr &p_data, TYPE *out, TYPE *&out_d, int nargs, + TYPE **arg, TYPE **&arg_d, + const std::vector< std::vector< int > > &argshape, + int sizeout) { + std::vector sizes(nargs); + int totsize = sizeout; + for (int k = 0; k < nargs; 
k++) { + sizes[k] = std::accumulate(argshape[k].begin(), argshape[k].end(), 1, std::multiplies< int >()); + totsize += sizes[k]; + } + + CUDA_SAFE_CALL(cuMemAlloc(&p_data, sizeof(TYPE *) * nargs + sizeof(TYPE) * totsize)); + + arg_d = (TYPE **) p_data; + TYPE *dataloc = (TYPE *) (arg_d + nargs); + + // host array of pointers to device data + std::vector ph(nargs); + + out_d = dataloc; + dataloc += sizeout; + for (int k = 0; k < nargs; k++) { + ph[k] = dataloc; + CUDA_SAFE_CALL(cuMemcpyHtoD((CUdeviceptr) dataloc, arg[k], sizeof(TYPE) * sizes[k])); + dataloc += sizes[k]; + } + + // copy array of pointers + CUDA_SAFE_CALL(cuMemcpyHtoD((CUdeviceptr) arg_d, ph.data(), nargs * sizeof(TYPE *))); +} diff --git a/pykeops/pykeops/common/keops_io/LoadKeOps_nvrtc.py b/pykeops/pykeops/common/keops_io/LoadKeOps_nvrtc.py index 95ef25489..71e3d621f 100644 --- a/pykeops/pykeops/common/keops_io/LoadKeOps_nvrtc.py +++ b/pykeops/pykeops/common/keops_io/LoadKeOps_nvrtc.py @@ -26,6 +26,7 @@ def init_phase2(self): pykeops_nvrtc = importlib.import_module("pykeops_nvrtc") if self.params.c_dtype == "float": + breakpoint() self.launch_keops = pykeops_nvrtc.KeOps_module_float( self.params.device_id_request, self.params.nargs, @@ -85,7 +86,12 @@ def compile_jit_binary(): dllname=pykeops.config.pykeops_nvrtc_name(type="target"), ) pyKeOps_Message("Compiling nvrtc binder for python ... 
", flush=True, end="") - KeOps_OS_Run(compile_command) + if os.name =="nt": + from keopscore.windows_compilations import compile_pykeops_nvrtc + compile_pykeops_nvrtc(build_folder=get_build_folder()) + + else: + KeOps_OS_Run(compile_command) pyKeOps_Message("OK", use_tag=False, flush=True) From 3631b9e149b906fa495afa72938042d8c3275662 Mon Sep 17 00:00:00 2001 From: Louis Pujol <52413616+Louis-Pujol@users.noreply.github.com> Date: Fri, 11 Jul 2025 10:23:40 +0200 Subject: [PATCH 08/25] lint --- keopscore/keopscore/binders/LinkCompile.py | 2 +- .../keopscore/binders/nvrtc/Gpu_link_compile.py | 5 +---- keopscore/keopscore/config/__init__.py | 1 + keopscore/keopscore/config/cuda_windows.py | 13 +++++-------- pykeops/pykeops/common/keops_io/LoadKeOps_cpp.py | 8 ++++---- pykeops/pykeops/common/keops_io/LoadKeOps_nvrtc.py | 5 +++-- 6 files changed, 15 insertions(+), 19 deletions(-) diff --git a/keopscore/keopscore/binders/LinkCompile.py b/keopscore/keopscore/binders/LinkCompile.py index e561745ce..d1ba38058 100644 --- a/keopscore/keopscore/binders/LinkCompile.py +++ b/keopscore/keopscore/binders/LinkCompile.py @@ -88,7 +88,7 @@ def write_code(self): f.write(self.code.replace("signed long int", "int")) else: f.write(self.code) - + f.close() def generate_code(self): diff --git a/keopscore/keopscore/binders/nvrtc/Gpu_link_compile.py b/keopscore/keopscore/binders/nvrtc/Gpu_link_compile.py index e0b90774f..9617d94e0 100644 --- a/keopscore/keopscore/binders/nvrtc/Gpu_link_compile.py +++ b/keopscore/keopscore/binders/nvrtc/Gpu_link_compile.py @@ -34,10 +34,7 @@ def jit_compile_dll(): if os.name == "nt": - return os.path.join( - build_folder, - "nvrtc_jit.dll" - ) + return os.path.join(build_folder, "nvrtc_jit.dll") else: return os.path.join( build_folder, diff --git a/keopscore/keopscore/config/__init__.py b/keopscore/keopscore/config/__init__.py index 314189442..a9c176ebd 100644 --- a/keopscore/keopscore/config/__init__.py +++ b/keopscore/keopscore/config/__init__.py @@ -1,6 
+1,7 @@ # Import the configuration classes from .base_config import Config import os + if os.name != "nt": from .cuda import CUDAConfig else: diff --git a/keopscore/keopscore/config/cuda_windows.py b/keopscore/keopscore/config/cuda_windows.py index 5431f8b9c..6d73f5c3a 100644 --- a/keopscore/keopscore/config/cuda_windows.py +++ b/keopscore/keopscore/config/cuda_windows.py @@ -29,7 +29,6 @@ from ..windows_compilations import cuda_detection - detection = cuda_detection.detect_cuda_toolkit() # cuda_lib = detection['lib_dirs'] # cuda_include = detection['include_dir'] @@ -53,7 +52,7 @@ def set_use_cuda(self): self._use_cuda = cuda_detection.cuda_available if not self._cuda_libraries_available(): self._use_cuda = False - + self.get_cuda_version() self.get_cuda_include_path() self.get_gpu_props() @@ -71,8 +70,7 @@ def _cuda_libraries_available(self): This is also where we handle one single warning if needed. """ - return 'dll_nvrtc' in detection and 'dll_cuda' in detection - + return "dll_nvrtc" in detection and "dll_cuda" in detection def get_cuda_version(self, out_type="single_value"): @@ -81,7 +79,7 @@ def get_cuda_version(self, out_type="single_value"): return None try: - libcudart = ctypes.CDLL(detection['dll_cudart']) + libcudart = ctypes.CDLL(detection["dll_cudart"]) cuda_version = ctypes.c_int() libcudart.cudaRuntimeGetVersion(ctypes.byref(cuda_version)) cuda_version_value = int(cuda_version.value) @@ -101,8 +99,7 @@ def get_cuda_version(self, out_type="single_value"): KeOps_Warning(f"Could not determine CUDA version: {e}") self.cuda_version = None return None - - + def get_gpu_props(self): """ Getting GPU properties and related attributes. 
@@ -114,7 +111,7 @@ def get_gpu_props(self): return (self.n_gpus, self.gpu_compile_flags) # Attempt to load the CUDA driver library - libcuda_path = detection['dll_cuda'] + libcuda_path = detection["dll_cuda"] # We have a handle, let's proceed libcuda = ctypes.CDLL(libcuda_path) diff --git a/pykeops/pykeops/common/keops_io/LoadKeOps_cpp.py b/pykeops/pykeops/common/keops_io/LoadKeOps_cpp.py index 17e0b26fb..23c9978fb 100644 --- a/pykeops/pykeops/common/keops_io/LoadKeOps_cpp.py +++ b/pykeops/pykeops/common/keops_io/LoadKeOps_cpp.py @@ -12,7 +12,6 @@ from pykeops.config import pykeops_cpp_name, python_includes - class LoadKeOps_cpp_class(LoadKeOps): def __init__(self, *args, fast_init=False): super().__init__(*args, fast_init=fast_init) @@ -191,7 +190,7 @@ def get_pybind11_code(self): m.def("launch_pykeops_cpu", &launch_pykeops_{self.params.tag}_cpu < {cpp_dtype[self.params.dtype]} >, "Entry point to keops."); }} """ - + else: return f""" #include "{self.params.source_name}" @@ -302,8 +301,9 @@ def get_pybind11_code(self): m.doc() = "pyKeOps: KeOps for pytorch through pybind11 (pytorch flavour)."; m.def("launch_pykeops_cpu", &launch_pykeops_{self.params.tag}_cpu < {cpp_dtype[self.params.dtype]} >, "Entry point to keops."); }} - """.replace("long", "int64_t") - + """.replace( + "long", "int64_t" + ) LoadKeOps_cpp = Cache_partial( diff --git a/pykeops/pykeops/common/keops_io/LoadKeOps_nvrtc.py b/pykeops/pykeops/common/keops_io/LoadKeOps_nvrtc.py index 71e3d621f..755896cf2 100644 --- a/pykeops/pykeops/common/keops_io/LoadKeOps_nvrtc.py +++ b/pykeops/pykeops/common/keops_io/LoadKeOps_nvrtc.py @@ -86,10 +86,11 @@ def compile_jit_binary(): dllname=pykeops.config.pykeops_nvrtc_name(type="target"), ) pyKeOps_Message("Compiling nvrtc binder for python ... 
", flush=True, end="") - if os.name =="nt": + if os.name == "nt": from keopscore.windows_compilations import compile_pykeops_nvrtc + compile_pykeops_nvrtc(build_folder=get_build_folder()) - + else: KeOps_OS_Run(compile_command) pyKeOps_Message("OK", use_tag=False, flush=True) From c2027193f801b9aed8d231713d45e045033e0d21 Mon Sep 17 00:00:00 2001 From: Louis Pujol <52413616+Louis-Pujol@users.noreply.github.com> Date: Fri, 11 Jul 2025 11:34:46 +0200 Subject: [PATCH 09/25] typo in pybind11 code --- pykeops/pykeops/common/keops_io/LoadKeOps_cpp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pykeops/pykeops/common/keops_io/LoadKeOps_cpp.py b/pykeops/pykeops/common/keops_io/LoadKeOps_cpp.py index 23c9978fb..238b50314 100644 --- a/pykeops/pykeops/common/keops_io/LoadKeOps_cpp.py +++ b/pykeops/pykeops/common/keops_io/LoadKeOps_cpp.py @@ -162,7 +162,7 @@ def get_pybind11_code(self): }} - return launch_keops_{self.params.tag}_cpu< TYPE >(dimY, + return launch_keops_cpu_{self.params.tag} < TYPE >(dimY, nx, ny, tagI, From f13f62b8dc287452122a95b7885a0d67c6f8d072 Mon Sep 17 00:00:00 2001 From: Louis Pujol <52413616+Louis-Pujol@users.noreply.github.com> Date: Fri, 11 Jul 2025 11:53:39 +0200 Subject: [PATCH 10/25] add `windows_compilation` package to `setup.py` --- keopscore/setup.py | 1 + pykeops/pykeops/common/keops_io/LoadKeOps_cpp.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/keopscore/setup.py b/keopscore/setup.py index a0854b4a2..dcbae7d02 100644 --- a/keopscore/setup.py +++ b/keopscore/setup.py @@ -59,6 +59,7 @@ "keopscore.mapreduce.cpu", "keopscore.mapreduce.gpu", "keopscore.utils", + "keopscore.windows_compilations", ], package_data={ "keopscore": [ diff --git a/pykeops/pykeops/common/keops_io/LoadKeOps_cpp.py b/pykeops/pykeops/common/keops_io/LoadKeOps_cpp.py index 238b50314..19d7aaa81 100644 --- a/pykeops/pykeops/common/keops_io/LoadKeOps_cpp.py +++ b/pykeops/pykeops/common/keops_io/LoadKeOps_cpp.py @@ -11,6 +11,8 @@ from 
keopscore.utils.misc_utils import KeOps_OS_Run from pykeops.config import pykeops_cpp_name, python_includes +#TODO limit code duplication for get_pybind11_code() + class LoadKeOps_cpp_class(LoadKeOps): def __init__(self, *args, fast_init=False): From 0ae6e891d3eb91cc3583d6b846c75179af8a768d Mon Sep 17 00:00:00 2001 From: Louis Pujol <52413616+Louis-Pujol@users.noreply.github.com> Date: Fri, 11 Jul 2025 16:11:47 +0200 Subject: [PATCH 11/25] update setup.pys (ugly fix for version on windows) --- keopscore/setup.py | 16 ++++++++++++++++ pykeops/setup.py | 9 +++++++++ 2 files changed, 25 insertions(+) diff --git a/keopscore/setup.py b/keopscore/setup.py index dcbae7d02..b961f0c40 100644 --- a/keopscore/setup.py +++ b/keopscore/setup.py @@ -11,6 +11,14 @@ with open(os.path.join(here, "keopscore", "keops_version"), encoding="utf-8") as v: current_version = v.read().rstrip() +#TODO fix this (issues with symlinks on windows ? -> moving to pyproject.toml ?) +if os.name == "nt": + with open(os.path.join(here, "..", "keops_version"), encoding="utf-8") as v: + current_version = v.read().rstrip() + # copy the content to keopscore/keops_version + with open(os.path.join(here, "keopscore", "keops_version"), "w", encoding="utf-8") as v: + v.write(current_version) + # Get the long description from the README file with open(path.join(here, "keopscore", "readme.md"), encoding="utf-8") as f: long_description = f.read() @@ -74,6 +82,14 @@ "include/Ranges.h", "include/Sizes.h", "include/utils_pe.h", + "binders/nvrtc/keops_nvrtc_win.cpp", + "binders/nvrtc/nvrtc_jit_win.cpp", + "include/CudaSizes_win.h", + "include/ranges_utils_win.h", + "include/Ranges_win.h", + "include/Sizes_win.h", + "include/utils_pe_win.h", + "windows_compilations/templates/CMakeLists.txt", ], }, install_requires=[], diff --git a/pykeops/setup.py b/pykeops/setup.py index d8154d91f..f32a9a699 100644 --- a/pykeops/setup.py +++ b/pykeops/setup.py @@ -12,6 +12,14 @@ with open(os.path.join(here, "pykeops", 
"keops_version"), encoding="utf-8") as v: current_version = v.read().rstrip() +#TODO fix this (issues with symlinks on windows ? -> moving to pyproject.toml ?) +if os.name == "nt": + with open(os.path.join(here, "..", "keops_version"), encoding="utf-8") as v: + current_version = v.read().rstrip() + # copy the content to pykeops/keops_version + with open(os.path.join(here, "pykeops", "keops_version"), "w", encoding="utf-8") as v: + v.write(current_version) + # Get the long description from the README file with open(path.join(here, "pykeops", "readme.md"), encoding="utf-8") as f: long_description = f.read() @@ -62,6 +70,7 @@ "licence.txt", "keops_version", "common/keops_io/pykeops_nvrtc.cpp", + "common/keops_io/pykeops_nvrtc_win.cpp", ], }, install_requires=["numpy", "pybind11", "keopscore"], From 28884242e8d6091cc76873705240fc3615b1e23b Mon Sep 17 00:00:00 2001 From: Louis Pujol <52413616+Louis-Pujol@users.noreply.github.com> Date: Fri, 11 Jul 2025 16:14:45 +0200 Subject: [PATCH 12/25] remove breakpoints --- keopscore/keopscore/binders/nvrtc/Gpu_link_compile.py | 1 - pykeops/pykeops/common/keops_io/LoadKeOps_nvrtc.py | 1 - 2 files changed, 2 deletions(-) diff --git a/keopscore/keopscore/binders/nvrtc/Gpu_link_compile.py b/keopscore/keopscore/binders/nvrtc/Gpu_link_compile.py index 9617d94e0..4833cec95 100644 --- a/keopscore/keopscore/binders/nvrtc/Gpu_link_compile.py +++ b/keopscore/keopscore/binders/nvrtc/Gpu_link_compile.py @@ -80,7 +80,6 @@ def generate_code(self): self.write_code() # we execute the main dll, passing the code as argument, and the name of the low level code file to save the assembly instructions - breakpoint() if os.name != "nt": res = self.my_c_dll.Compile( create_string_buffer(self.low_level_code_file), diff --git a/pykeops/pykeops/common/keops_io/LoadKeOps_nvrtc.py b/pykeops/pykeops/common/keops_io/LoadKeOps_nvrtc.py index 755896cf2..13e6cdc8e 100644 --- a/pykeops/pykeops/common/keops_io/LoadKeOps_nvrtc.py +++ 
b/pykeops/pykeops/common/keops_io/LoadKeOps_nvrtc.py @@ -26,7 +26,6 @@ def init_phase2(self): pykeops_nvrtc = importlib.import_module("pykeops_nvrtc") if self.params.c_dtype == "float": - breakpoint() self.launch_keops = pykeops_nvrtc.KeOps_module_float( self.params.device_id_request, self.params.nargs, From 830deb76e8ff154c87aeeefca66f8dbe94699a9b Mon Sep 17 00:00:00 2001 From: Louis Pujol <52413616+Louis-Pujol@users.noreply.github.com> Date: Fri, 11 Jul 2025 16:32:05 +0200 Subject: [PATCH 13/25] cuda detection: set use cuda to false if CUDA_PATH is not accessible --- keopscore/keopscore/config/cuda_windows.py | 6 ++++++ keopscore/keopscore/windows_compilations/cuda_detection.py | 7 +++---- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/keopscore/keopscore/config/cuda_windows.py b/keopscore/keopscore/config/cuda_windows.py index 6d73f5c3a..8dbaa554c 100644 --- a/keopscore/keopscore/config/cuda_windows.py +++ b/keopscore/keopscore/config/cuda_windows.py @@ -50,6 +50,12 @@ class CUDAConfigWin(CUDAConfig): def set_use_cuda(self): """Determine and set whether to use CUDA.""" self._use_cuda = cuda_detection.cuda_available + + if not self._use_cuda: + self.cuda_message = "CUDA libraries not detected; Switching to CPU only." 
+ KeOps_Warning(self.cuda_message) + + # Check if both cuda and nvrtc libraries are available if not self._cuda_libraries_available(): self._use_cuda = False diff --git a/keopscore/keopscore/windows_compilations/cuda_detection.py b/keopscore/keopscore/windows_compilations/cuda_detection.py index 43513c95d..63e8ad2be 100644 --- a/keopscore/keopscore/windows_compilations/cuda_detection.py +++ b/keopscore/keopscore/windows_compilations/cuda_detection.py @@ -2,14 +2,13 @@ from ctypes.util import find_library from pathlib import Path -cuda_available = find_library("nvcuda") is not None - +cuda_available = "CUDA_PATH" in os.environ def detect_cuda_toolkit(): - if os.environ["CUDA_PATH"]: + output = {} - output = {} + if cuda_available: cuda_path = Path( os.environ["CUDA_PATH"] From 5a51e698bf70720819dfe7b36953c87dd7ae238f Mon Sep 17 00:00:00 2001 From: Louis Pujol <52413616+Louis-Pujol@users.noreply.github.com> Date: Fri, 11 Jul 2025 16:32:32 +0200 Subject: [PATCH 14/25] lint --- keopscore/keopscore/config/cuda_windows.py | 4 ++-- keopscore/keopscore/windows_compilations/cuda_detection.py | 1 + keopscore/setup.py | 6 ++++-- pykeops/pykeops/common/keops_io/LoadKeOps_cpp.py | 2 +- pykeops/setup.py | 6 ++++-- 5 files changed, 12 insertions(+), 7 deletions(-) diff --git a/keopscore/keopscore/config/cuda_windows.py b/keopscore/keopscore/config/cuda_windows.py index 8dbaa554c..21d5d6f4e 100644 --- a/keopscore/keopscore/config/cuda_windows.py +++ b/keopscore/keopscore/config/cuda_windows.py @@ -50,11 +50,11 @@ class CUDAConfigWin(CUDAConfig): def set_use_cuda(self): """Determine and set whether to use CUDA.""" self._use_cuda = cuda_detection.cuda_available - + if not self._use_cuda: self.cuda_message = "CUDA libraries not detected; Switching to CPU only." 
KeOps_Warning(self.cuda_message) - + # Check if both cuda and nvrtc libraries are available if not self._cuda_libraries_available(): self._use_cuda = False diff --git a/keopscore/keopscore/windows_compilations/cuda_detection.py b/keopscore/keopscore/windows_compilations/cuda_detection.py index 63e8ad2be..c6de764c3 100644 --- a/keopscore/keopscore/windows_compilations/cuda_detection.py +++ b/keopscore/keopscore/windows_compilations/cuda_detection.py @@ -4,6 +4,7 @@ cuda_available = "CUDA_PATH" in os.environ + def detect_cuda_toolkit(): output = {} diff --git a/keopscore/setup.py b/keopscore/setup.py index b961f0c40..a68664515 100644 --- a/keopscore/setup.py +++ b/keopscore/setup.py @@ -11,12 +11,14 @@ with open(os.path.join(here, "keopscore", "keops_version"), encoding="utf-8") as v: current_version = v.read().rstrip() -#TODO fix this (issues with symlinks on windows ? -> moving to pyproject.toml ?) +# TODO fix this (issues with symlinks on windows ? -> moving to pyproject.toml ?) if os.name == "nt": with open(os.path.join(here, "..", "keops_version"), encoding="utf-8") as v: current_version = v.read().rstrip() # copy the content to keopscore/keops_version - with open(os.path.join(here, "keopscore", "keops_version"), "w", encoding="utf-8") as v: + with open( + os.path.join(here, "keopscore", "keops_version"), "w", encoding="utf-8" + ) as v: v.write(current_version) # Get the long description from the README file diff --git a/pykeops/pykeops/common/keops_io/LoadKeOps_cpp.py b/pykeops/pykeops/common/keops_io/LoadKeOps_cpp.py index 19d7aaa81..961568023 100644 --- a/pykeops/pykeops/common/keops_io/LoadKeOps_cpp.py +++ b/pykeops/pykeops/common/keops_io/LoadKeOps_cpp.py @@ -11,7 +11,7 @@ from keopscore.utils.misc_utils import KeOps_OS_Run from pykeops.config import pykeops_cpp_name, python_includes -#TODO limit code duplication for get_pybind11_code() +# TODO limit code duplication for get_pybind11_code() class LoadKeOps_cpp_class(LoadKeOps): diff --git a/pykeops/setup.py 
b/pykeops/setup.py index f32a9a699..9dc2c2fd1 100644 --- a/pykeops/setup.py +++ b/pykeops/setup.py @@ -12,12 +12,14 @@ with open(os.path.join(here, "pykeops", "keops_version"), encoding="utf-8") as v: current_version = v.read().rstrip() -#TODO fix this (issues with symlinks on windows ? -> moving to pyproject.toml ?) +# TODO fix this (issues with symlinks on windows ? -> moving to pyproject.toml ?) if os.name == "nt": with open(os.path.join(here, "..", "keops_version"), encoding="utf-8") as v: current_version = v.read().rstrip() # copy the content to pykeops/keops_version - with open(os.path.join(here, "pykeops", "keops_version"), "w", encoding="utf-8") as v: + with open( + os.path.join(here, "pykeops", "keops_version"), "w", encoding="utf-8" + ) as v: v.write(current_version) # Get the long description from the README file From 52f9b7eebb0bce8debf438b3a06e9518b155f1d5 Mon Sep 17 00:00:00 2001 From: Louis Pujol <52413616+Louis-Pujol@users.noreply.github.com> Date: Fri, 11 Jul 2025 17:01:04 +0200 Subject: [PATCH 15/25] fix cuda detection --- keopscore/keopscore/config/__init__.py | 2 ++ keopscore/keopscore/config/cuda_windows.py | 3 +++ .../windows_compilations/detection.py | 24 +++++++++---------- 3 files changed, 17 insertions(+), 12 deletions(-) diff --git a/keopscore/keopscore/config/__init__.py b/keopscore/keopscore/config/__init__.py index a9c176ebd..11fa028ce 100644 --- a/keopscore/keopscore/config/__init__.py +++ b/keopscore/keopscore/config/__init__.py @@ -6,6 +6,8 @@ from .cuda import CUDAConfig else: from .cuda_windows import CUDAConfigWin as CUDAConfig + +#TODO openmp and c++ compiler detection for windows from .openmp import OpenMPConfig from .Platform import DetectPlatform diff --git a/keopscore/keopscore/config/cuda_windows.py b/keopscore/keopscore/config/cuda_windows.py index 21d5d6f4e..e23c4240b 100644 --- a/keopscore/keopscore/config/cuda_windows.py +++ b/keopscore/keopscore/config/cuda_windows.py @@ -29,6 +29,8 @@ from ..windows_compilations 
import cuda_detection +cuda_available = cuda_detection.cuda_available + detection = cuda_detection.detect_cuda_toolkit() # cuda_lib = detection['lib_dirs'] # cuda_include = detection['include_dir'] @@ -52,6 +54,7 @@ def set_use_cuda(self): self._use_cuda = cuda_detection.cuda_available if not self._use_cuda: + print("fzefgrrtg") self.cuda_message = "CUDA libraries not detected; Switching to CPU only." KeOps_Warning(self.cuda_message) diff --git a/keopscore/keopscore/windows_compilations/detection.py b/keopscore/keopscore/windows_compilations/detection.py index 57548129b..059d3c9fe 100644 --- a/keopscore/keopscore/windows_compilations/detection.py +++ b/keopscore/keopscore/windows_compilations/detection.py @@ -4,7 +4,7 @@ import pybind11 -from .cuda_detection import detect_cuda_toolkit +from .cuda_detection import detect_cuda_toolkit, cuda_available from .utils import find_package_location include_dirs = {} @@ -45,18 +45,18 @@ # Cuda -cuda_config = detect_cuda_toolkit() +if cuda_available: + cuda_config = detect_cuda_toolkit() + for key in ["cuda", "nvrtc", "cudart"]: -for key in ["cuda", "nvrtc", "cudart"]: + if f"dll_{key}" in cuda_config: + dlls[key] = cuda_config[f"dll_{key}"] - if f"dll_{key}" in cuda_config: - dlls[key] = cuda_config[f"dll_{key}"] + if key in cuda_config["lib_names"]: + lib_names[key] = cuda_config["lib_names"][key] - if key in cuda_config["lib_names"]: - lib_names[key] = cuda_config["lib_names"][key] + if "include_dir" in cuda_config: + include_dirs["cuda"] = cuda_config["include_dir"] -if "include_dir" in cuda_config: - include_dirs["cuda"] = cuda_config["include_dir"] - -if "lib_dirs" in cuda_config: - lib_dirs["cuda"] = cuda_config["lib_dirs"] + if "lib_dirs" in cuda_config: + lib_dirs["cuda"] = cuda_config["lib_dirs"] From e1f304c96b5d8b470db5245371c9923203c6fd76 Mon Sep 17 00:00:00 2001 From: Louis Pujol <52413616+Louis-Pujol@users.noreply.github.com> Date: Fri, 11 Jul 2025 17:01:43 +0200 Subject: [PATCH 16/25] lint --- 
keopscore/keopscore/config/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/keopscore/keopscore/config/__init__.py b/keopscore/keopscore/config/__init__.py index 11fa028ce..839b05793 100644 --- a/keopscore/keopscore/config/__init__.py +++ b/keopscore/keopscore/config/__init__.py @@ -7,7 +7,7 @@ else: from .cuda_windows import CUDAConfigWin as CUDAConfig -#TODO openmp and c++ compiler detection for windows +# TODO openmp and c++ compiler detection for windows from .openmp import OpenMPConfig from .Platform import DetectPlatform From 0222d16f3162dd1b705248ef161a443a95f5013f Mon Sep 17 00:00:00 2001 From: Louis Pujol <52413616+Louis-Pujol@users.noreply.github.com> Date: Fri, 11 Jul 2025 19:45:28 +0200 Subject: [PATCH 17/25] fix argument types in call_keops for windows --- pykeops/pykeops/common/keops_io/LoadKeOps_nvrtc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pykeops/pykeops/common/keops_io/LoadKeOps_nvrtc.py b/pykeops/pykeops/common/keops_io/LoadKeOps_nvrtc.py index 13e6cdc8e..f46aed6c1 100644 --- a/pykeops/pykeops/common/keops_io/LoadKeOps_nvrtc.py +++ b/pykeops/pykeops/common/keops_io/LoadKeOps_nvrtc.py @@ -66,7 +66,7 @@ def call_keops(self, nx, ny): self.params.dimsp, self.ranges_ptr_new, self.outshape, - self.out_ptr, + self.out_ptr if os.name != "nt" else (self.out_ptr, ), self.args_ptr_new, self.argshapes_new, ) From 593b0594b3b35160d4be14389eb4b41cd38912ba Mon Sep 17 00:00:00 2001 From: Louis Pujol <52413616+Louis-Pujol@users.noreply.github.com> Date: Fri, 11 Jul 2025 20:18:58 +0200 Subject: [PATCH 18/25] change type annotations for paths --- keopscore/keopscore/windows_compilations/compile.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/keopscore/keopscore/windows_compilations/compile.py b/keopscore/keopscore/windows_compilations/compile.py index f0c556ea1..aa1b72f24 100644 --- a/keopscore/keopscore/windows_compilations/compile.py +++ 
b/keopscore/keopscore/windows_compilations/compile.py @@ -6,15 +6,14 @@ from .globals import tmp_dir -path_type = str | os.PathLike | bytes _empty_list = [] def compile( - source_file: path_type, + source_file: os.PathLike, project_name: str | None = None, - includes: list[path_type] | None = _empty_list, - link_dirs: list[path_type] | None = _empty_list, + includes: list[os.PathLike] | None = _empty_list, + link_dirs: list[os.PathLike] | None = _empty_list, links: list[str] | None = _empty_list, macros: list[str] | None = _empty_list, suffix: str = ".dll", From 3144bad1b8a4d180337bd4e822170a0c1254f6f2 Mon Sep 17 00:00:00 2001 From: Louis Pujol <52413616+Louis-Pujol@users.noreply.github.com> Date: Fri, 11 Jul 2025 20:28:53 +0200 Subject: [PATCH 19/25] fix annotations issues in compile.py --- keopscore/keopscore/windows_compilations/compile.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/keopscore/keopscore/windows_compilations/compile.py b/keopscore/keopscore/windows_compilations/compile.py index aa1b72f24..139dcb5e9 100644 --- a/keopscore/keopscore/windows_compilations/compile.py +++ b/keopscore/keopscore/windows_compilations/compile.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import os import shutil import sysconfig @@ -6,14 +8,15 @@ from .globals import tmp_dir +path_type = str | os.PathLike _empty_list = [] def compile( - source_file: os.PathLike, + source_file: path_type, project_name: str | None = None, - includes: list[os.PathLike] | None = _empty_list, - link_dirs: list[os.PathLike] | None = _empty_list, + includes: list[path_type] | None = _empty_list, + link_dirs: list[path_type] | None = _empty_list, links: list[str] | None = _empty_list, macros: list[str] | None = _empty_list, suffix: str = ".dll", From f3813c66b32828ae16d7087ea301adbf9de80904 Mon Sep 17 00:00:00 2001 From: Louis Pujol <52413616+Louis-Pujol@users.noreply.github.com> Date: Fri, 11 Jul 2025 20:32:43 +0200 Subject: [PATCH 20/25] fix typing for 
paths --- keopscore/keopscore/windows_compilations/compile.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/keopscore/keopscore/windows_compilations/compile.py b/keopscore/keopscore/windows_compilations/compile.py index 139dcb5e9..9ee91f622 100644 --- a/keopscore/keopscore/windows_compilations/compile.py +++ b/keopscore/keopscore/windows_compilations/compile.py @@ -8,15 +8,13 @@ from .globals import tmp_dir -path_type = str | os.PathLike _empty_list = [] - def compile( - source_file: path_type, + source_file: os.PathLike, project_name: str | None = None, - includes: list[path_type] | None = _empty_list, - link_dirs: list[path_type] | None = _empty_list, + includes: list[os.PathLike] | None = _empty_list, + link_dirs: list[os.PathLike] | None = _empty_list, links: list[str] | None = _empty_list, macros: list[str] | None = _empty_list, suffix: str = ".dll", From eb0e52aeaeabd93bb5c229fa521d88f46230e5e9 Mon Sep 17 00:00:00 2001 From: Louis Pujol <52413616+Louis-Pujol@users.noreply.github.com> Date: Fri, 11 Jul 2025 20:38:09 +0200 Subject: [PATCH 21/25] remove dummy print --- keopscore/keopscore/config/cuda_windows.py | 1 - keopscore/keopscore/windows_compilations/compile.py | 1 + pykeops/pykeops/common/keops_io/LoadKeOps_nvrtc.py | 2 +- 3 files changed, 2 insertions(+), 2 deletions(-) diff --git a/keopscore/keopscore/config/cuda_windows.py b/keopscore/keopscore/config/cuda_windows.py index e23c4240b..47a8b4cee 100644 --- a/keopscore/keopscore/config/cuda_windows.py +++ b/keopscore/keopscore/config/cuda_windows.py @@ -54,7 +54,6 @@ def set_use_cuda(self): self._use_cuda = cuda_detection.cuda_available if not self._use_cuda: - print("fzefgrrtg") self.cuda_message = "CUDA libraries not detected; Switching to CPU only." 
KeOps_Warning(self.cuda_message) diff --git a/keopscore/keopscore/windows_compilations/compile.py b/keopscore/keopscore/windows_compilations/compile.py index 9ee91f622..d01699e4a 100644 --- a/keopscore/keopscore/windows_compilations/compile.py +++ b/keopscore/keopscore/windows_compilations/compile.py @@ -10,6 +10,7 @@ _empty_list = [] + def compile( source_file: os.PathLike, project_name: str | None = None, diff --git a/pykeops/pykeops/common/keops_io/LoadKeOps_nvrtc.py b/pykeops/pykeops/common/keops_io/LoadKeOps_nvrtc.py index f46aed6c1..651d8d9b7 100644 --- a/pykeops/pykeops/common/keops_io/LoadKeOps_nvrtc.py +++ b/pykeops/pykeops/common/keops_io/LoadKeOps_nvrtc.py @@ -66,7 +66,7 @@ def call_keops(self, nx, ny): self.params.dimsp, self.ranges_ptr_new, self.outshape, - self.out_ptr if os.name != "nt" else (self.out_ptr, ), + self.out_ptr if os.name != "nt" else (self.out_ptr,), self.args_ptr_new, self.argshapes_new, ) From 192c0dfe95943dc7bb00bd3a33bbb6e4c5baa9ec Mon Sep 17 00:00:00 2001 From: Louis Pujol <52413616+Louis-Pujol@users.noreply.github.com> Date: Fri, 19 Sep 2025 17:25:58 +0200 Subject: [PATCH 22/25] use cuda-pathfinder to find cuda dlls on windows --- keopscore/keopscore/config/cuda_windows.py | 2 +- .../compile_pykeops_nvrtc.py | 12 +- .../windows_compilations/cuda_detection.py | 10 +- .../cuda_detection/CMakeLists.txt | 127 ++++++++++++++++++ keopscore/setup.py | 5 +- 5 files changed, 144 insertions(+), 12 deletions(-) create mode 100644 keopscore/keopscore/windows_compilations/cuda_detection/CMakeLists.txt diff --git a/keopscore/keopscore/config/cuda_windows.py b/keopscore/keopscore/config/cuda_windows.py index 47a8b4cee..0660ea010 100644 --- a/keopscore/keopscore/config/cuda_windows.py +++ b/keopscore/keopscore/config/cuda_windows.py @@ -201,7 +201,7 @@ def safe_call(dev_idx, result_code): return (self.n_gpus, self.gpu_compile_flags) SharedMemPerBlock[d] = output.value - # Build compile flags string + # Build compile flags string #TODO auto 
detection for macros is here self.gpu_compile_flags = f"-DMAXIDGPU={self.n_gpus - 1} " for d in range(self.n_gpus): self.gpu_compile_flags += ( diff --git a/keopscore/keopscore/windows_compilations/compile_pykeops_nvrtc.py b/keopscore/keopscore/windows_compilations/compile_pykeops_nvrtc.py index e27a09c3e..01478569d 100644 --- a/keopscore/keopscore/windows_compilations/compile_pykeops_nvrtc.py +++ b/keopscore/keopscore/windows_compilations/compile_pykeops_nvrtc.py @@ -15,12 +15,12 @@ def compile_pykeops_nvrtc(build_folder): source_file = pykeops_dir / "common" / "keops_io" / "pykeops_nvrtc_win.cpp" macros = [ - "-DMAXIDGPU=0", - "-DMAXTHREADSPERBLOCK0=1024", - "-DSHAREDMEMPERBLOCK0=49152", - "-DnvrtcGetTARGET=nvrtcGetCUBIN", - "-DnvrtcGetTARGETSize=nvrtcGetCUBINSize", - '-DARCHTAG="sm"', + "-DMAXIDGPU=0", #TODO should be detected + "-DMAXTHREADSPERBLOCK0=1024", #TODO should be detected + "-DSHAREDMEMPERBLOCK0=49152", #TODO should be detected + "-DnvrtcGetTARGET=nvrtcGetCUBIN", #TODO should be detected + "-DnvrtcGetTARGETSize=nvrtcGetCUBINSize", #TODO should be detected + '-DARCHTAG="sm"', #TODO should be detected ] compile( diff --git a/keopscore/keopscore/windows_compilations/cuda_detection.py b/keopscore/keopscore/windows_compilations/cuda_detection.py index c6de764c3..62f56af40 100644 --- a/keopscore/keopscore/windows_compilations/cuda_detection.py +++ b/keopscore/keopscore/windows_compilations/cuda_detection.py @@ -2,8 +2,9 @@ from ctypes.util import find_library from pathlib import Path -cuda_available = "CUDA_PATH" in os.environ +from cuda import pathfinder +cuda_available = "CUDA_PATH" in os.environ def detect_cuda_toolkit(): @@ -14,12 +15,13 @@ def detect_cuda_toolkit(): cuda_path = Path( os.environ["CUDA_PATH"] ) # base path for cuda installation (including bin, lib, include, etc.) 
- cuda_bin = Path(cuda_path, "bin") # where the dlls are located - if find_library("nvcuda") is not None: + if find_library("nvcuda") is not None: # NVCUDA is the main CUDA driver library output["dll_cuda"] = find_library("nvcuda") - for file in cuda_bin.iterdir(): + cuda_dlls_dir = Path(pathfinder.load_nvidia_dynamic_lib("cudart").abs_path).parent + + for file in cuda_dlls_dir.iterdir(): if file.name.startswith("cudart") and file.name.endswith(".dll"): output["dll_cudart"] = str(file) diff --git a/keopscore/keopscore/windows_compilations/cuda_detection/CMakeLists.txt b/keopscore/keopscore/windows_compilations/cuda_detection/CMakeLists.txt new file mode 100644 index 000000000..30cca9752 --- /dev/null +++ b/keopscore/keopscore/windows_compilations/cuda_detection/CMakeLists.txt @@ -0,0 +1,127 @@ +cmake_minimum_required(VERSION 3.21) +project(cuda_detection LANGUAGES CXX) + +# If you have multiple toolkits, pass -DCUDAToolkit_ROOT="C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.6" +find_package(CUDAToolkit REQUIRED) + +# --- Derive include and lib dirs ------------------------------------------------ +# include_dir: usually one path +list(GET CUDAToolkit_INCLUDE_DIRS 0 CUDA_DETECT_INCLUDE_DIR) + +# lib_dirs: prefer .../lib/x64 if present, else CUDAToolkit_LIBRARY_DIR +set(CUDA_DETECT_LIB_DIR "${CUDAToolkit_LIBRARY_DIR}") +if(EXISTS "${CUDAToolkit_LIBRARY_DIR}/x64") + set(CUDA_DETECT_LIB_DIR "${CUDAToolkit_LIBRARY_DIR}/x64") +endif() + +# bin dir for DLLs +set(_BIN "${CUDAToolkit_BIN_DIR}") + +# --- Version tag for NVRTC-builtins (12.6 -> 126) ------------------------------ +string(REPLACE "." 
";" _ver_list "${CUDAToolkit_VERSION}") +list(GET _ver_list 0 _maj) +list(LENGTH _ver_list _len) +if(_len GREATER 1) + list(GET _ver_list 1 _min) +else() + set(_min 0) +endif() +math(EXPR _TAG "${_maj} * 10 + ${_min}") + +# --- dll_cuda (driver DLL) ----------------------------------------------------- +# Prefer System32 (64-bit) +find_file(CUDA_DETECT_DLL_CUDA + NAMES nvcuda.dll + PATHS "C:/Windows/System32" "C:/Windows/SysWOW64" # fallback shows 32-bit if only that exists + NO_DEFAULT_PATH) + +# --- dll_cudart (runtime DLL) -------------------------------------------------- +# Try versioned names first, then any cudart64*.dll in the toolkit bin +set(CUDA_DETECT_DLL_CUDART "") +foreach(_cand + "cudart64_${_maj}${_min}.dll" + "cudart64.dll") + find_file(CUDA_DETECT_DLL_CUDART NAMES "${_cand}" HINTS "${_BIN}" NO_DEFAULT_PATH) + if(CUDA_DETECT_DLL_CUDART) + break() + endif() +endforeach() +if(NOT CUDA_DETECT_DLL_CUDART) + file(GLOB _cudart_glob LIST_DIRECTORIES OFF "${_BIN}/cudart64*.dll") + if(_cudart_glob) + list(GET _cudart_glob 0 CUDA_DETECT_DLL_CUDART) + endif() +endif() + +# --- dll_nvrtc (you asked for the *builtins* DLL specifically) ----------------- +# Search for nvrtc-builtins matching our tag, else first available +set(CUDA_DETECT_DLL_NVRTC "") +file(GLOB _builtins_glob LIST_DIRECTORIES OFF "${_BIN}/nvrtc-builtins*.dll") +if(_builtins_glob) + foreach(_dll IN LISTS _builtins_glob) + get_filename_component(_name "${_dll}" NAME) + if(_name MATCHES "nvrtc-builtins.*${_TAG}") + set(CUDA_DETECT_DLL_NVRTC "${_dll}") + break() + endif() + endforeach() + if(NOT CUDA_DETECT_DLL_NVRTC) + list(GET _builtins_glob 0 CUDA_DETECT_DLL_NVRTC) + endif() +endif() + +# --- Sanity hints (optional warnings) ----------------------------------------- +if(NOT CUDA_DETECT_DLL_CUDA) + message(WARNING "nvcuda.dll not found in System32/SysWOW64.") +endif() +if(NOT CUDA_DETECT_DLL_CUDART) + message(WARNING "cudart64*.dll not found in ${_BIN}.") +endif() +if(NOT CUDA_DETECT_DLL_NVRTC) + 
message(WARNING "nvrtc-builtins*.dll not found in ${_BIN}.") +endif() + +# --- Produce outputs in your desired schema ----------------------------------- +# lib_names are the import-library basenames +set(_LIB_NAMES_PY "{'cuda': 'cuda', 'cudart': 'cudart', 'nvrtc': 'nvrtc'}") + +# Escape backslashes for Python/JSON strings +function(_escape_for_py in_var out_var) + string(REPLACE "\\" "\\\\" _tmp "${${in_var}}") + set(${out_var} "${_tmp}" PARENT_SCOPE) +endfunction() + +_escape_for_py(CUDA_DETECT_DLL_CUDA _DLL_CUDA_PY) +_escape_for_py(CUDA_DETECT_DLL_CUDART _DLL_CUDART_PY) +_escape_for_py(CUDA_DETECT_DLL_NVRTC _DLL_NVRTC_PY) +_escape_for_py(CUDA_DETECT_INCLUDE_DIR _INCLUDE_PY) +_escape_for_py(CUDA_DETECT_LIB_DIR _LIBDIR_PY) + +# Python-style dict (exact key names as you requested) +set(_DICT_PY + "{'dll_cuda': '${_DLL_CUDA_PY}', " + "'dll_cudart': '${_DLL_CUDART_PY}', " + "'dll_nvrtc': '${_DLL_NVRTC_PY}', " + "'include_dir': '${_INCLUDE_PY}', " + "'lib_dirs': '${_LIBDIR_PY}', " + "'lib_names': ${_LIB_NAMES_PY}}" +) + +message(STATUS "CUDA_DETECTION=${_DICT_PY}") + +# Also emit a JSON file (cuda_detect.json) in the build dir for programmatic use +# Note: using the same values; JSON requires quoted keys and escaped paths. +set(_JSON + "{\n" + " \"dll_cuda\": \"${_DLL_CUDA_PY}\",\n" + " \"dll_cudart\": \"${_DLL_CUDART_PY}\",\n" + " \"dll_nvrtc\": \"${_DLL_NVRTC_PY}\",\n" + " \"include_dir\": \"${_INCLUDE_PY}\",\n" + " \"lib_dirs\": \"${_LIBDIR_PY}\",\n" + " \"lib_names\": {\"cuda\": \"cuda\", \"cudart\": \"cudart\", \"nvrtc\": \"nvrtc\"}\n" + "}\n") +file(WRITE "${CMAKE_BINARY_DIR}/cuda_detect.json" "${_JSON}") + +# Dummy target so configure+build does something +add_custom_target(show_detect ALL + COMMENT "CUDA detection complete. 
See message above or cuda_detect.json.") diff --git a/keopscore/setup.py b/keopscore/setup.py index a68664515..639e1b565 100644 --- a/keopscore/setup.py +++ b/keopscore/setup.py @@ -94,6 +94,9 @@ "windows_compilations/templates/CMakeLists.txt", ], }, - install_requires=[], + install_requires=[ + "cuda-pathfinder", + "cmake" + ], extras_require={}, ) From 58cb294b18e32b4c59b8553c7407a1712fb8e8d7 Mon Sep 17 00:00:00 2001 From: Louis Pujol <52413616+Louis-Pujol@users.noreply.github.com> Date: Fri, 19 Sep 2025 17:27:39 +0200 Subject: [PATCH 23/25] lint --- .../windows_compilations/compile_pykeops_nvrtc.py | 12 ++++++------ .../keopscore/windows_compilations/cuda_detection.py | 7 +++++-- keopscore/setup.py | 5 +---- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/keopscore/keopscore/windows_compilations/compile_pykeops_nvrtc.py b/keopscore/keopscore/windows_compilations/compile_pykeops_nvrtc.py index 01478569d..361f00362 100644 --- a/keopscore/keopscore/windows_compilations/compile_pykeops_nvrtc.py +++ b/keopscore/keopscore/windows_compilations/compile_pykeops_nvrtc.py @@ -15,12 +15,12 @@ def compile_pykeops_nvrtc(build_folder): source_file = pykeops_dir / "common" / "keops_io" / "pykeops_nvrtc_win.cpp" macros = [ - "-DMAXIDGPU=0", #TODO should be detected - "-DMAXTHREADSPERBLOCK0=1024", #TODO should be detected - "-DSHAREDMEMPERBLOCK0=49152", #TODO should be detected - "-DnvrtcGetTARGET=nvrtcGetCUBIN", #TODO should be detected - "-DnvrtcGetTARGETSize=nvrtcGetCUBINSize", #TODO should be detected - '-DARCHTAG="sm"', #TODO should be detected + "-DMAXIDGPU=0", # TODO should be detected + "-DMAXTHREADSPERBLOCK0=1024", # TODO should be detected + "-DSHAREDMEMPERBLOCK0=49152", # TODO should be detected + "-DnvrtcGetTARGET=nvrtcGetCUBIN", # TODO should be detected + "-DnvrtcGetTARGETSize=nvrtcGetCUBINSize", # TODO should be detected + '-DARCHTAG="sm"', # TODO should be detected ] compile( diff --git a/keopscore/keopscore/windows_compilations/cuda_detection.py 
b/keopscore/keopscore/windows_compilations/cuda_detection.py index 62f56af40..18554bb62 100644 --- a/keopscore/keopscore/windows_compilations/cuda_detection.py +++ b/keopscore/keopscore/windows_compilations/cuda_detection.py @@ -6,6 +6,7 @@ cuda_available = "CUDA_PATH" in os.environ + def detect_cuda_toolkit(): output = {} @@ -16,10 +17,12 @@ def detect_cuda_toolkit(): os.environ["CUDA_PATH"] ) # base path for cuda installation (including bin, lib, include, etc.) - if find_library("nvcuda") is not None: # NVCUDA is the main CUDA driver library + if find_library("nvcuda") is not None: # NVCUDA is the main CUDA driver library output["dll_cuda"] = find_library("nvcuda") - cuda_dlls_dir = Path(pathfinder.load_nvidia_dynamic_lib("cudart").abs_path).parent + cuda_dlls_dir = Path( + pathfinder.load_nvidia_dynamic_lib("cudart").abs_path + ).parent for file in cuda_dlls_dir.iterdir(): diff --git a/keopscore/setup.py b/keopscore/setup.py index 639e1b565..e4fca04d2 100644 --- a/keopscore/setup.py +++ b/keopscore/setup.py @@ -94,9 +94,6 @@ "windows_compilations/templates/CMakeLists.txt", ], }, - install_requires=[ - "cuda-pathfinder", - "cmake" - ], + install_requires=["cuda-pathfinder", "cmake"], extras_require={}, ) From 7ba96196367446b0d528920b92d2c47d3ad489c5 Mon Sep 17 00:00:00 2001 From: Louis Pujol Date: Fri, 19 Sep 2025 17:51:09 +0200 Subject: [PATCH 24/25] remove cuda pathfinder, check x86 folder for cuda dlls --- .../windows_compilations/cuda_detection.py | 54 ++++++++++++++----- keopscore/setup.py | 2 +- 2 files changed, 41 insertions(+), 15 deletions(-) diff --git a/keopscore/keopscore/windows_compilations/cuda_detection.py b/keopscore/keopscore/windows_compilations/cuda_detection.py index 18554bb62..1a8e7db11 100644 --- a/keopscore/keopscore/windows_compilations/cuda_detection.py +++ b/keopscore/keopscore/windows_compilations/cuda_detection.py @@ -1,9 +1,21 @@ +r""" +CUDA toolkit detection on Windows. + +CUDA_PATH environment variable must be set. 
It is usually set by the CUDA installer, if not it +must point to a valid CUDA installation (typically C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\vX.Y). + +The detection looks for the following files: +- cudart*.dll +- nvrtc-builtins*.dll +- nvcuda.dll (CUDA driver library, usually located in system32 folder) +- include directory +- lib/x64 directory containing cuda.lib, nvrtc.lib and cudart.lib +""" + import os from ctypes.util import find_library from pathlib import Path -from cuda import pathfinder - cuda_available = "CUDA_PATH" in os.environ @@ -20,19 +32,30 @@ def detect_cuda_toolkit(): if find_library("nvcuda") is not None: # NVCUDA is the main CUDA driver library output["dll_cuda"] = find_library("nvcuda") - cuda_dlls_dir = Path( - pathfinder.load_nvidia_dynamic_lib("cudart").abs_path - ).parent - - for file in cuda_dlls_dir.iterdir(): - - if file.name.startswith("cudart") and file.name.endswith(".dll"): - output["dll_cudart"] = str(file) - - if file.name.startswith("nvrtc-builtins") and file.name.endswith(".dll"): - output["dll_nvrtc"] = str(file) + cuda_path = Path( + os.environ["CUDA_PATH"] + ) # base path for cuda installation (including bin, lib, include, etc.) 
- # See the files in Path(cuda_path, "cmake") for something more automatic here + ################################################# + # Detect relevant DLLs: cudart and nvrtc-builtins + ################################################# + + # Check both bin and bin/x86 directories for relevant DLLs + bin_dirs = [Path(cuda_path, "bin"), Path(cuda_path, "bin", "x86")] + + for bin_dir in bin_dirs: + if bin_dir.is_dir(): + for file in bin_dir.iterdir(): + if file.name.startswith("cudart") and file.name.endswith(".dll"): + output["dll_cudart"] = str(file) + if file.name.startswith("nvrtc-builtins") and file.name.endswith( + ".dll" + ): + output["dll_nvrtc"] = str(file) + + ################################################# + # Detect include and lib directories + ################################################# cuda_include = Path(cuda_path, "include") if cuda_include.is_dir(): output["include_dir"] = str(cuda_include) @@ -41,6 +64,9 @@ def detect_cuda_toolkit(): if cuda_libs.is_dir(): output["lib_dirs"] = str(cuda_libs) + ################################################# + # Make sure that cudart, nvrtc and cuda libs are available + ################################################# output["lib_names"] = {} for key in ["cuda", "nvrtc", "cudart"]: diff --git a/keopscore/setup.py b/keopscore/setup.py index e4fca04d2..77c9eaaa8 100644 --- a/keopscore/setup.py +++ b/keopscore/setup.py @@ -94,6 +94,6 @@ "windows_compilations/templates/CMakeLists.txt", ], }, - install_requires=["cuda-pathfinder", "cmake"], + install_requires=["cmake"], extras_require={}, ) From e890b208a4b5fd8e4e69d6c4314ee6656797ed94 Mon Sep 17 00:00:00 2001 From: Louis Pujol Date: Fri, 19 Sep 2025 17:57:03 +0200 Subject: [PATCH 25/25] typo x86 -> x64 --- keopscore/keopscore/windows_compilations/cuda_detection.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/keopscore/keopscore/windows_compilations/cuda_detection.py b/keopscore/keopscore/windows_compilations/cuda_detection.py index 
1a8e7db11..3f4b8f63d 100644 --- a/keopscore/keopscore/windows_compilations/cuda_detection.py +++ b/keopscore/keopscore/windows_compilations/cuda_detection.py @@ -40,8 +40,8 @@ def detect_cuda_toolkit(): # Detect relevant DLLs: cudart and nvrtc-builtins ################################################# - # Check both bin and bin/x86 directories for relevant DLLs - bin_dirs = [Path(cuda_path, "bin"), Path(cuda_path, "bin", "x86")] + # Check both bin and bin/x64 directories for relevant DLLs + bin_dirs = [Path(cuda_path, "bin"), Path(cuda_path, "bin", "x64")] for bin_dir in bin_dirs: if bin_dir.is_dir():