From 2460b3aa5dc0e90ad01f5159ebc581f5bb1cf9b0 Mon Sep 17 00:00:00 2001
From: Louis Pujol <Louis.Pujol@3ds.com>
Date: Mon, 19 May 2025 15:26:11 +0200
Subject: [PATCH 01/25] remove unused imports

---
 pykeops/pykeops/common/utils.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/pykeops/pykeops/common/utils.py b/pykeops/pykeops/common/utils.py
index 104485ee9..d4d37fe7d 100644
--- a/pykeops/pykeops/common/utils.py
+++ b/pykeops/pykeops/common/utils.py
@@ -1,8 +1,3 @@
-import fcntl
-import functools
-import importlib.util
-import os
-
 import pykeops.config
 
 c_type = dict(float16="half2", float32="float", float64="double")

From 143537cb95876a611dcd4f5a2d2d883ef4b82355 Mon Sep 17 00:00:00 2001
From: Louis Pujol <52413616+Louis-Pujol@users.noreply.github.com>
Date: Mon, 19 May 2025 16:06:46 +0200
Subject: [PATCH 02/25] add windows_compilations subpackage to keopscore

---
 .../windows_compilations/__init__.py          |  27 ++++
 .../keopscore/windows_compilations/compile.py | 126 ++++++++++++++++++
 .../windows_compilations/compile_nvrtc_jit.py |  44 ++++++
 .../compile_pykeops_cpp_module.py             |  34 +++++
 .../compile_pykeops_nvrtc.py                  |  44 ++++++
 .../windows_compilations/cuda_detection.py    |  42 ++++++
 .../windows_compilations/detection.py         |  64 +++++++++
 .../keopscore/windows_compilations/globals.py |   5 +
 .../templates/CMakeLists.txt                  |  39 ++++++
 .../keopscore/windows_compilations/utils.py   |  31 +++++
 10 files changed, 456 insertions(+)
 create mode 100644 keopscore/keopscore/windows_compilations/__init__.py
 create mode 100644 keopscore/keopscore/windows_compilations/compile.py
 create mode 100644 keopscore/keopscore/windows_compilations/compile_nvrtc_jit.py
 create mode 100644 keopscore/keopscore/windows_compilations/compile_pykeops_cpp_module.py
 create mode 100644 keopscore/keopscore/windows_compilations/compile_pykeops_nvrtc.py
 create mode 100644 keopscore/keopscore/windows_compilations/cuda_detection.py
 create mode 100644 keopscore/keopscore/windows_compilations/detection.py
 create mode 100644 keopscore/keopscore/windows_compilations/globals.py
 create mode 100644 keopscore/keopscore/windows_compilations/templates/CMakeLists.txt
 create mode 100644 keopscore/keopscore/windows_compilations/utils.py

diff --git a/keopscore/keopscore/windows_compilations/__init__.py b/keopscore/keopscore/windows_compilations/__init__.py
new file mode 100644
index 000000000..985c7f110
--- /dev/null
+++ b/keopscore/keopscore/windows_compilations/__init__.py
@@ -0,0 +1,27 @@
+"""Initialize the windows_compilations package and create dictionaries with found includes/libs/dlls"""
+
+from .compile import compile
+from .compile_nvrtc_jit import compile_nvrtc_jit
+from .compile_pykeops_cpp_module import compile_pykeops_cpp_module
+from .compile_pykeops_nvrtc import compile_pykeops_nvrtc
+from .detection import (
+    dlls,
+    include_dirs,
+    keops_available,
+    lib_dirs,
+    lib_names,
+)
+from .globals import tmp_dir
+
+__all__ = [
+    "compile",
+    "tmp_dir",
+    "include_dirs",
+    "lib_dirs",
+    "lib_names",
+    "dlls",
+    "compile_pykeops_cpp_module",
+    "compile_nvrtc_jit",
+    "compile_pykeops_nvrtc",
+    "keops_available",
+    ]
diff --git a/keopscore/keopscore/windows_compilations/compile.py b/keopscore/keopscore/windows_compilations/compile.py
new file mode 100644
index 000000000..d07058500
--- /dev/null
+++ b/keopscore/keopscore/windows_compilations/compile.py
@@ -0,0 +1,126 @@
+import os
+import shutil
+import sysconfig
+import uuid
+from pathlib import Path
+
+from .globals import tmp_dir
+
+path_type = str | os.PathLike | bytes
+_empty_list = []
+
+def compile(
+        source_file: path_type,
+        project_name: str | None = None,
+        includes: list[path_type] | None = _empty_list,
+        link_dirs: list[path_type] | None = _empty_list,
+        links: list[str] | None = _empty_list,
+        macros: list[str] | None = _empty_list,
+        suffix: str = ".dll",
+        output_dir=".",
+        print_cmakelists=False,
+        show_cmake_commands_output=False,
+        clean_tmp_build_dir=True,
+        ):
+
+    output_dir = Path(output_dir)
+
+    if project_name is None:
+        project_name = "".join(Path(source_file).name.split(".")[:-1])
+
+    includes_str = ""
+    for include in includes:
+        includes_str += f"include_directories(\"{include!s}\")\n"
+    includes_str = includes_str.replace("\\", "/")
+
+    link_dirs_str = ""
+    for link in link_dirs:
+        link_dirs_str += f"link_directories(\"{link!s}\")\n"
+    link_dirs_str = link_dirs_str.replace("\\", "/")
+
+    macros_str = ""
+    for macro in macros:
+        macros_str += f"add_definitions({macro!s})\n"
+
+    if len(links) == 0:
+        links = ""
+
+    else:
+        inline_links = " ".join(links)
+        links = f"target_link_libraries(${{PROJECT_NAME}} {inline_links})"
+
+    with Path.open(Path(__file__).parent / "templates" / "CMakeLists.txt") as f:
+        content = f.read()
+
+    includes = includes_str
+    link_dirs = link_dirs_str
+    macros = macros_str
+    source_file = str(Path(source_file).resolve()).replace("\\", "/")
+
+    if "pyd" in suffix:
+        suffix = sysconfig.get_config_var('EXT_SUFFIX')
+
+    fields = ["source_file", "project_name", "includes", "link_dirs", "links", "macros", "suffix"]
+
+    for field in fields:
+        content = content.replace(
+            f"***{field}***",
+            locals()[field]
+        )
+
+
+    import os
+    import subprocess
+
+    cwd = Path.cwd()
+
+    # Go to tmp dir
+
+    tmp_build = tmp_dir / str(uuid.uuid4())
+
+    tmp_build.mkdir()
+    if (tmp_build / "build").is_dir():
+        shutil.rmtree(tmp_build / "build")
+
+    tmp_build.mkdir(exist_ok=True)
+
+    with Path.open(Path(tmp_build) / "CMakeLists.txt", "w") as f:
+        f.write(content)
+
+    if print_cmakelists:
+        print(content)
+
+    os.chdir(tmp_build)
+    Path("build").mkdir()
+    os.chdir("build")
+
+    if not show_cmake_commands_output:
+
+        with Path.open(tmp_build / "log", "w") as log_file:
+
+            subprocess.check_call(["cmake", ".."], stdout=log_file)
+            subprocess.check_call(["cmake", "--build", ".", "--config", "Release"], stdout=log_file)
+
+    else:
+        subprocess.check_call(["cmake", ".."])
+        subprocess.check_call(["cmake", "--build", ".", "--config", "Release"])
+
+    # Back to previous working directory
+    os.chdir(cwd)
+
+    # Define the source and destination directories
+    Path(output_dir).mkdir(exist_ok=True)
+
+    source_dir = tmp_build / "build" / "Release"
+
+    # Copy the contents of the source directory to the destination directory
+    for item in os.listdir(source_dir):
+        s = source_dir / item
+        d = output_dir / item
+
+        # Copy files or directories
+        if str(s).endswith(suffix):
+            shutil.copy2(s, d)
+
+    if clean_tmp_build_dir:
+        shutil.rmtree(tmp_build)
diff --git a/keopscore/keopscore/windows_compilations/compile_nvrtc_jit.py b/keopscore/keopscore/windows_compilations/compile_nvrtc_jit.py
new file mode 100644
index 000000000..5317cca21
--- /dev/null
+++ b/keopscore/keopscore/windows_compilations/compile_nvrtc_jit.py
@@ -0,0 +1,44 @@
+from pathlib import Path
+
+from .compile import compile
+from .detection import (
+    include_dirs,
+    lib_dirs,
+    lib_names,
+)
+from .utils import find_package_location
+
+
+def compile_nvrtc_jit(build_folder):
+
+    keops_dir = Path(find_package_location("keopscore")).parent
+    source_file = keops_dir / "binders" / "nvrtc" / "nvrtc_jit.cpp"
+
+
+    macros = [
+        "-DMAXIDGPU=0",
+        "-DMAXTHREADSPERBLOCK0=1024",
+        "-DSHAREDMEMPERBLOCK0=49152",
+        "-DnvrtcGetTARGET=nvrtcGetCUBIN",
+        "-DnvrtcGetTARGETSize=nvrtcGetCUBINSize",
+        "-DARCHTAG=\"sm\""
+    ]
+
+
+    compile(
+        source_file=source_file,
+        macros=macros,
+        includes= [
+            include_dirs[key] for key in ["keops", "cuda"]
+        ],
+        link_dirs=[
+            lib_dirs[key] for key in ["cuda"]
+        ],
+        links=[
+            lib_names[key] for key in ["cuda", "nvrtc", "cudart"]
+        ],
+        suffix='.dll',
+        output_dir=build_folder,
+        print_cmakelists=False,
+        show_cmake_commands_output=False
+    )
diff --git a/keopscore/keopscore/windows_compilations/compile_pykeops_cpp_module.py b/keopscore/keopscore/windows_compilations/compile_pykeops_cpp_module.py
new file mode 100644
index 000000000..cb40f10b6
--- /dev/null
+++ b/keopscore/keopscore/windows_compilations/compile_pykeops_cpp_module.py
@@ -0,0 +1,34 @@
+from .compile import compile
+from .detection import include_dirs, lib_dirs, lib_names
+
+
+def compile_pykeops_cpp_module(source_file, build_folder):
+    """This built-in compilation config serves to compile pykeops cpp modules
+
+    Parameters
+    ----------
+    source_file
+        Location of the source cpp file
+    build_folder
+        The KeOps build folder
+
+    """
+
+    compile(
+        source_file=source_file,
+        includes= [
+            include_dirs["python"],
+            include_dirs["pybind11"],
+            include_dirs["keops"]
+        ],
+        link_dirs=[
+            lib_dirs["python"]
+        ],
+        links=[
+            lib_names["python"]
+        ],
+        suffix='.pyd',
+        output_dir=build_folder,
+        print_cmakelists=False,
+        show_cmake_commands_output=False
+    )
diff --git a/keopscore/keopscore/windows_compilations/compile_pykeops_nvrtc.py b/keopscore/keopscore/windows_compilations/compile_pykeops_nvrtc.py
new file mode 100644
index 000000000..fdae10309
--- /dev/null
+++ b/keopscore/keopscore/windows_compilations/compile_pykeops_nvrtc.py
@@ -0,0 +1,44 @@
+from pathlib import Path
+
+from .compile import compile
+from .detection import (
+    include_dirs,
+    lib_dirs,
+    lib_names,
+)
+from .utils import find_package_location
+
+
+def compile_pykeops_nvrtc(build_folder):
+
+    pykeops_dir = Path(find_package_location("pykeops")).parent
+    source_file = pykeops_dir / "common" / "keops_io" / "pykeops_nvrtc.cpp"
+
+
+    macros = [
+        "-DMAXIDGPU=0",
+        "-DMAXTHREADSPERBLOCK0=1024",
+        "-DSHAREDMEMPERBLOCK0=49152",
+        "-DnvrtcGetTARGET=nvrtcGetCUBIN",
+        "-DnvrtcGetTARGETSize=nvrtcGetCUBINSize",
+        "-DARCHTAG=\"sm\""
+    ]
+
+
+    compile(
+        source_file=source_file,
+        macros=macros,
+        includes= [
+            include_dirs[key] for key in ["python", "pybind11", "keops", "cuda"]
+        ],
+        link_dirs=[
+            lib_dirs[key] for key in ["python", "cuda"]
+        ],
+        links=[
+            lib_names[key] for key in ["cuda", "nvrtc", "cudart", "python"]
+        ],
+        suffix='.pyd',
+        output_dir=build_folder,
+        print_cmakelists=False,
+        show_cmake_commands_output=False,
+    )
diff --git a/keopscore/keopscore/windows_compilations/cuda_detection.py b/keopscore/keopscore/windows_compilations/cuda_detection.py
new file mode 100644
index 000000000..18d11c8bc
--- /dev/null
+++ b/keopscore/keopscore/windows_compilations/cuda_detection.py
@@ -0,0 +1,42 @@
+import os
+from ctypes.util import find_library
+from pathlib import Path
+
+
+def detect_cuda_toolkit():
+
+    if os.environ["CUDA_PATH"]:
+
+        output = {}
+
+        cuda_path = Path(os.environ["CUDA_PATH"]) # base path for cuda installation (including bin, lib, include, etc.)
+        cuda_bin = Path(cuda_path, "bin") # where the dlls are located
+
+        if find_library("nvcuda") is not None:
+            output["dll_cuda"] = find_library("nvcuda")
+
+        for file in cuda_bin.iterdir():
+
+            if file.name.startswith("cudart") and file.name.endswith(".dll"):
+                output["dll_cudart"] = str(file)
+
+            if file.name.startswith("nvrtc-builtins") and file.name.endswith(".dll"):
+                output["dll_nvrtc"] = str(file)
+
+        # See the files in Path(cuda_path, "cmake") for something more automatic here
+        cuda_include = Path(cuda_path, "include")
+        if cuda_include.is_dir():
+            output["include_dir"] = str(cuda_include)
+
+
+        cuda_libs = Path(cuda_path, "lib", "x64")
+        if cuda_libs.is_dir():
+            output["lib_dirs"] = str(cuda_libs)
+
+        output["lib_names"] = {}
+        for key in ["cuda", "nvrtc", "cudart"]:
+
+            if (cuda_libs / (key + ".lib")).is_file():
+                output["lib_names"][key] = key
+
+    return output
diff --git a/keopscore/keopscore/windows_compilations/detection.py b/keopscore/keopscore/windows_compilations/detection.py
new file mode 100644
index 000000000..f745f6dda
--- /dev/null
+++ b/keopscore/keopscore/windows_compilations/detection.py
@@ -0,0 +1,64 @@
+import sys
+import sysconfig
+from pathlib import Path
+
+import pybind11
+
+from .cuda_detection import detect_cuda_toolkit
+from .utils import find_package_location
+
+include_dirs = {}
+lib_dirs = {}
+lib_names = {}
+dlls = {}
+
+
+try:
+    location_keops_init = find_package_location("keopscore")
+    include_dirs["keops"] = str(Path(location_keops_init).parent)
+    keops_available = True
+except ImportError:
+    keops_available = False
+
+
+
+include_dirs["pybind11"] = pybind11.get_include()
+
+
+# Python
+include_dirs["python"] = sysconfig.get_path('include')
+
+if sys.platform == "win32":
+    # On Windows, get the path to the Python DLL
+    python_libs = Path(sysconfig.get_path('include')).parent / "libs"
+
+    # Get the path to the standard library (Lib)
+    if python_libs.is_dir():
+        lib_dirs["python"] = str(python_libs)
+
+        version = str(sys.version_info.major) + str(sys.version_info.minor)
+        if (Path(lib_dirs["python"]) / ("python" + version + ".lib")).is_file():
+            lib_names["python"] = "python" + version
+else:
+    # On Unix-like systems, use sysconfig to get the library directory and name
+    lib_dirs["python"] = Path(sysconfig.get_config_var('LIBDIR'))
+    lib_names["python"] = sysconfig.get_config_var('LDLIBRARY')
+
+
+
+# Cuda
+cuda_config = detect_cuda_toolkit()
+
+for key in ["cuda", "nvrtc", "cudart"]:
+
+    if f"dll_{key}" in cuda_config:
+        dlls[key] = cuda_config[f"dll_{key}"]
+
+    if key in cuda_config["lib_names"]:
+        lib_names[key] = cuda_config["lib_names"][key]
+
+if "include_dir" in cuda_config:
+    include_dirs["cuda"] = cuda_config["include_dir"]
+
+if "lib_dirs" in cuda_config:
+    lib_dirs["cuda"] = cuda_config["lib_dirs"]
diff --git a/keopscore/keopscore/windows_compilations/globals.py b/keopscore/keopscore/windows_compilations/globals.py
new file mode 100644
index 000000000..9eb9504c8
--- /dev/null
+++ b/keopscore/keopscore/windows_compilations/globals.py
@@ -0,0 +1,5 @@
+from pathlib import Path
+
+tmp_dir = Path.home() / "tmp" / "cppcompile_windows"
+(Path.home() / "tmp").mkdir(exist_ok=True)
+tmp_dir.mkdir(exist_ok=True)
diff --git a/keopscore/keopscore/windows_compilations/templates/CMakeLists.txt b/keopscore/keopscore/windows_compilations/templates/CMakeLists.txt
new file mode 100644
index 000000000..458a6b5af
--- /dev/null
+++ b/keopscore/keopscore/windows_compilations/templates/CMakeLists.txt
@@ -0,0 +1,39 @@
+cmake_minimum_required(VERSION 3.10)
+project(***project_name***)
+
+# Set C++ standard
+set(CMAKE_CXX_STANDARD 11)
+
+# Set compiler options
+if(MSVC)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /LD /Ox /GL /openmp")
+else()
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -O3 -fopenmp")
+endif()
+
+# Preprocessor macros
+***macros***
+
+# Include directories
+***includes***
+
+# Link directories
+***link_dirs***
+
+# Source file
+set(SOURCE_FILE "***source_file***")
+
+# Add shared library
+add_library(${PROJECT_NAME} SHARED ${SOURCE_FILE})
+
+# Link
+***links***
+
+# Set the output path for the shared library and ensure the .pyd extension
+set_target_properties(${PROJECT_NAME} PROPERTIES
+    LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
+    OUTPUT_NAME "***project_name***"
+    SUFFIX "***suffix***"  # Ensure the correct file extension for Python modules
+    MSVC_WARNING_LEVEL 0
+    MSVC_WARNING_DISABLE 8029 # Warning because we are building in a tmp dir
+)
diff --git a/keopscore/keopscore/windows_compilations/utils.py b/keopscore/keopscore/windows_compilations/utils.py
new file mode 100644
index 000000000..320a06009
--- /dev/null
+++ b/keopscore/keopscore/windows_compilations/utils.py
@@ -0,0 +1,31 @@
+import importlib.util
+
+
+def find_package_location(package_name: str) -> str:
+    """Find the __init__ file of a given package
+
+    This function does not import the package, it was written to avoid circular
+    imports with KeOps
+
+    Parameters
+    ----------
+    package_name
+        The name of the package
+
+    Returns
+    -------
+    str
+        The path to the package
+
+    Raises
+    ------
+    ImportError
+        If the package cannot be loaded
+
+    """
+    spec = importlib.util.find_spec(package_name)
+    if spec.origin:
+        return spec.origin
+    else:
+        message =  f"Package '{package_name}' not found."
+        raise ImportError(message)

From c17d6e39f2a71332dd4f1a21ef392aa3f89cd911 Mon Sep 17 00:00:00 2001
From: Louis Pujol <52413616+Louis-Pujol@users.noreply.github.com>
Date: Mon, 19 May 2025 16:07:55 +0200
Subject: [PATCH 03/25] Use vectors in utils_pe.h

---
 keopscore/keopscore/include/utils_pe.h | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/keopscore/keopscore/include/utils_pe.h b/keopscore/keopscore/include/utils_pe.h
index 55c91babc..56ef5e99b 100644
--- a/keopscore/keopscore/include/utils_pe.h
+++ b/keopscore/keopscore/include/utils_pe.h
@@ -1,5 +1,6 @@
 #include <cuda.h>
 #include <numeric>
+#include <vector>
 
 #define NVRTC_SAFE_CALL(x)                                                     \
   do {                                                                         \
@@ -68,7 +69,8 @@ void load_args_FromHost(CUdeviceptr &p_data, TYPE *out, TYPE *&out_d, int nargs,
                         TYPE **arg, TYPE **&arg_d,
                         const std::vector<std::vector<signed long int>> &argshape,
                         signed long int sizeout) {
-  signed long int sizes[nargs];
+
+  std::vector<signed long int> sizes(nargs);
   signed long int totsize = sizeout;
   for (int k = 0; k < nargs; k++) {
     sizes[k] = std::accumulate(argshape[k].begin(), argshape[k].end(), 1,
@@ -83,7 +85,7 @@ void load_args_FromHost(CUdeviceptr &p_data, TYPE *out, TYPE *&out_d, int nargs,
   TYPE *dataloc = (TYPE *)(arg_d + nargs);
 
   // host array of pointers to device data
-  TYPE *ph[nargs];
+  std::vector<TYPE *> ph(nargs);
 
   out_d = dataloc;
   dataloc += sizeout;
@@ -95,5 +97,5 @@ void load_args_FromHost(CUdeviceptr &p_data, TYPE *out, TYPE *&out_d, int nargs,
   }
 
   // copy array of pointers
-  CUDA_SAFE_CALL(cuMemcpyHtoD((CUdeviceptr)arg_d, ph, nargs * sizeof(TYPE *)));
+  CUDA_SAFE_CALL(cuMemcpyHtoD((CUdeviceptr)arg_d, ph.data(), nargs * sizeof(TYPE *)));
 }

From 4abebfd0e88814f68d68b2cfb0a5a0d799280c80 Mon Sep 17 00:00:00 2001
From: Louis Pujol <52413616+Louis-Pujol@users.noreply.github.com>
Date: Mon, 19 May 2025 17:20:08 +0200
Subject: [PATCH 04/25] lint

---
 .../windows_compilations/__init__.py          |  2 +-
 .../keopscore/windows_compilations/compile.py | 53 +++++++++++--------
 .../windows_compilations/compile_nvrtc_jit.py | 20 +++----
 .../compile_pykeops_cpp_module.py             | 16 +++---
 .../compile_pykeops_nvrtc.py                  | 18 ++-----
 .../windows_compilations/cuda_detection.py    |  7 +--
 .../windows_compilations/detection.py         | 10 ++--
 .../keopscore/windows_compilations/utils.py   |  2 +-
 8 files changed, 57 insertions(+), 71 deletions(-)

diff --git a/keopscore/keopscore/windows_compilations/__init__.py b/keopscore/keopscore/windows_compilations/__init__.py
index 985c7f110..c8ba3a787 100644
--- a/keopscore/keopscore/windows_compilations/__init__.py
+++ b/keopscore/keopscore/windows_compilations/__init__.py
@@ -24,4 +24,4 @@
     "compile_nvrtc_jit",
     "compile_pykeops_nvrtc",
     "keops_available",
-    ]
+]
diff --git a/keopscore/keopscore/windows_compilations/compile.py b/keopscore/keopscore/windows_compilations/compile.py
index d07058500..f0c556ea1 100644
--- a/keopscore/keopscore/windows_compilations/compile.py
+++ b/keopscore/keopscore/windows_compilations/compile.py
@@ -9,19 +9,20 @@
 path_type = str | os.PathLike | bytes
 _empty_list = []
 
+
 def compile(
-        source_file: path_type,
-        project_name: str | None = None,
-        includes: list[path_type] | None = _empty_list,
-        link_dirs: list[path_type] | None = _empty_list,
-        links: list[str] | None = _empty_list,
-        macros: list[str] | None = _empty_list,
-        suffix: str = ".dll",
-        output_dir=".",
-        print_cmakelists=False,
-        show_cmake_commands_output=False,
-        clean_tmp_build_dir=True,
-        ):
+    source_file: path_type,
+    project_name: str | None = None,
+    includes: list[path_type] | None = _empty_list,
+    link_dirs: list[path_type] | None = _empty_list,
+    links: list[str] | None = _empty_list,
+    macros: list[str] | None = _empty_list,
+    suffix: str = ".dll",
+    output_dir=".",
+    print_cmakelists=False,
+    show_cmake_commands_output=False,
+    clean_tmp_build_dir=True,
+):
 
     output_dir = Path(output_dir)
 
@@ -30,12 +31,12 @@ def compile(
 
     includes_str = ""
     for include in includes:
-        includes_str += f"include_directories(\"{include!s}\")\n"
+        includes_str += f'include_directories("{include!s}")\n'
     includes_str = includes_str.replace("\\", "/")
 
     link_dirs_str = ""
     for link in link_dirs:
-        link_dirs_str += f"link_directories(\"{link!s}\")\n"
+        link_dirs_str += f'link_directories("{link!s}")\n'
     link_dirs_str = link_dirs_str.replace("\\", "/")
 
     macros_str = ""
@@ -58,16 +59,20 @@ def compile(
     source_file = str(Path(source_file).resolve()).replace("\\", "/")
 
     if "pyd" in suffix:
-        suffix = sysconfig.get_config_var('EXT_SUFFIX')
-
-    fields = ["source_file", "project_name", "includes", "link_dirs", "links", "macros", "suffix"]
+        suffix = sysconfig.get_config_var("EXT_SUFFIX")
+
+    fields = [
+        "source_file",
+        "project_name",
+        "includes",
+        "link_dirs",
+        "links",
+        "macros",
+        "suffix",
+    ]
 
     for field in fields:
-        content = content.replace(
-            f"***{field}***",
-            locals()[field]
-        )
-
+        content = content.replace(f"***{field}***", locals()[field])
 
     import os
     import subprocess
@@ -99,7 +104,9 @@ def compile(
         with Path.open(tmp_build / "log", "w") as log_file:
 
             subprocess.check_call(["cmake", ".."], stdout=log_file)
-            subprocess.check_call(["cmake", "--build", ".", "--config", "Release"], stdout=log_file)
+            subprocess.check_call(
+                ["cmake", "--build", ".", "--config", "Release"], stdout=log_file
+            )
 
     else:
         subprocess.check_call(["cmake", ".."])
diff --git a/keopscore/keopscore/windows_compilations/compile_nvrtc_jit.py b/keopscore/keopscore/windows_compilations/compile_nvrtc_jit.py
index 5317cca21..23bd71679 100644
--- a/keopscore/keopscore/windows_compilations/compile_nvrtc_jit.py
+++ b/keopscore/keopscore/windows_compilations/compile_nvrtc_jit.py
@@ -14,31 +14,23 @@ def compile_nvrtc_jit(build_folder):
     keops_dir = Path(find_package_location("keopscore")).parent
     source_file = keops_dir / "binders" / "nvrtc" / "nvrtc_jit.cpp"
 
-
     macros = [
         "-DMAXIDGPU=0",
         "-DMAXTHREADSPERBLOCK0=1024",
         "-DSHAREDMEMPERBLOCK0=49152",
         "-DnvrtcGetTARGET=nvrtcGetCUBIN",
         "-DnvrtcGetTARGETSize=nvrtcGetCUBINSize",
-        "-DARCHTAG=\"sm\""
+        '-DARCHTAG="sm"',
     ]
 
-
     compile(
         source_file=source_file,
         macros=macros,
-        includes= [
-            include_dirs[key] for key in ["keops", "cuda"]
-        ],
-        link_dirs=[
-            lib_dirs[key] for key in ["cuda"]
-        ],
-        links=[
-            lib_names[key] for key in ["cuda", "nvrtc", "cudart"]
-        ],
-        suffix='.dll',
+        includes=[include_dirs[key] for key in ["keops", "cuda"]],
+        link_dirs=[lib_dirs[key] for key in ["cuda"]],
+        links=[lib_names[key] for key in ["cuda", "nvrtc", "cudart"]],
+        suffix=".dll",
         output_dir=build_folder,
         print_cmakelists=False,
-        show_cmake_commands_output=False
+        show_cmake_commands_output=False,
     )
diff --git a/keopscore/keopscore/windows_compilations/compile_pykeops_cpp_module.py b/keopscore/keopscore/windows_compilations/compile_pykeops_cpp_module.py
index cb40f10b6..66e84b94c 100644
--- a/keopscore/keopscore/windows_compilations/compile_pykeops_cpp_module.py
+++ b/keopscore/keopscore/windows_compilations/compile_pykeops_cpp_module.py
@@ -16,19 +16,15 @@ def compile_pykeops_cpp_module(source_file, build_folder):
 
     compile(
         source_file=source_file,
-        includes= [
+        includes=[
             include_dirs["python"],
             include_dirs["pybind11"],
-            include_dirs["keops"]
+            include_dirs["keops"],
         ],
-        link_dirs=[
-            lib_dirs["python"]
-        ],
-        links=[
-            lib_names["python"]
-        ],
-        suffix='.pyd',
+        link_dirs=[lib_dirs["python"]],
+        links=[lib_names["python"]],
+        suffix=".pyd",
         output_dir=build_folder,
         print_cmakelists=False,
-        show_cmake_commands_output=False
+        show_cmake_commands_output=False,
     )
diff --git a/keopscore/keopscore/windows_compilations/compile_pykeops_nvrtc.py b/keopscore/keopscore/windows_compilations/compile_pykeops_nvrtc.py
index fdae10309..7fac03c00 100644
--- a/keopscore/keopscore/windows_compilations/compile_pykeops_nvrtc.py
+++ b/keopscore/keopscore/windows_compilations/compile_pykeops_nvrtc.py
@@ -14,30 +14,22 @@ def compile_pykeops_nvrtc(build_folder):
     pykeops_dir = Path(find_package_location("pykeops")).parent
     source_file = pykeops_dir / "common" / "keops_io" / "pykeops_nvrtc.cpp"
 
-
     macros = [
         "-DMAXIDGPU=0",
         "-DMAXTHREADSPERBLOCK0=1024",
         "-DSHAREDMEMPERBLOCK0=49152",
         "-DnvrtcGetTARGET=nvrtcGetCUBIN",
         "-DnvrtcGetTARGETSize=nvrtcGetCUBINSize",
-        "-DARCHTAG=\"sm\""
+        '-DARCHTAG="sm"',
     ]
 
-
     compile(
         source_file=source_file,
         macros=macros,
-        includes= [
-            include_dirs[key] for key in ["python", "pybind11", "keops", "cuda"]
-        ],
-        link_dirs=[
-            lib_dirs[key] for key in ["python", "cuda"]
-        ],
-        links=[
-            lib_names[key] for key in ["cuda", "nvrtc", "cudart", "python"]
-        ],
-        suffix='.pyd',
+        includes=[include_dirs[key] for key in ["python", "pybind11", "keops", "cuda"]],
+        link_dirs=[lib_dirs[key] for key in ["python", "cuda"]],
+        links=[lib_names[key] for key in ["cuda", "nvrtc", "cudart", "python"]],
+        suffix=".pyd",
         output_dir=build_folder,
         print_cmakelists=False,
         show_cmake_commands_output=False,
diff --git a/keopscore/keopscore/windows_compilations/cuda_detection.py b/keopscore/keopscore/windows_compilations/cuda_detection.py
index 18d11c8bc..5c5d1dbab 100644
--- a/keopscore/keopscore/windows_compilations/cuda_detection.py
+++ b/keopscore/keopscore/windows_compilations/cuda_detection.py
@@ -9,8 +9,10 @@ def detect_cuda_toolkit():
 
         output = {}
 
-        cuda_path = Path(os.environ["CUDA_PATH"]) # base path for cuda installation (including bin, lib, include, etc.)
-        cuda_bin = Path(cuda_path, "bin") # where the dlls are located
+        cuda_path = Path(
+            os.environ["CUDA_PATH"]
+        )  # base path for cuda installation (including bin, lib, include, etc.)
+        cuda_bin = Path(cuda_path, "bin")  # where the dlls are located
 
         if find_library("nvcuda") is not None:
             output["dll_cuda"] = find_library("nvcuda")
@@ -28,7 +30,6 @@ def detect_cuda_toolkit():
         if cuda_include.is_dir():
             output["include_dir"] = str(cuda_include)
 
-
         cuda_libs = Path(cuda_path, "lib", "x64")
         if cuda_libs.is_dir():
             output["lib_dirs"] = str(cuda_libs)
diff --git a/keopscore/keopscore/windows_compilations/detection.py b/keopscore/keopscore/windows_compilations/detection.py
index f745f6dda..57548129b 100644
--- a/keopscore/keopscore/windows_compilations/detection.py
+++ b/keopscore/keopscore/windows_compilations/detection.py
@@ -21,16 +21,15 @@
     keops_available = False
 
 
-
 include_dirs["pybind11"] = pybind11.get_include()
 
 
 # Python
-include_dirs["python"] = sysconfig.get_path('include')
+include_dirs["python"] = sysconfig.get_path("include")
 
 if sys.platform == "win32":
     # On Windows, get the path to the Python DLL
-    python_libs = Path(sysconfig.get_path('include')).parent / "libs"
+    python_libs = Path(sysconfig.get_path("include")).parent / "libs"
 
     # Get the path to the standard library (Lib)
     if python_libs.is_dir():
@@ -41,9 +40,8 @@
             lib_names["python"] = "python" + version
 else:
     # On Unix-like systems, use sysconfig to get the library directory and name
-    lib_dirs["python"] = Path(sysconfig.get_config_var('LIBDIR'))
-    lib_names["python"] = sysconfig.get_config_var('LDLIBRARY')
-
+    lib_dirs["python"] = Path(sysconfig.get_config_var("LIBDIR"))
+    lib_names["python"] = sysconfig.get_config_var("LDLIBRARY")
 
 
 # Cuda
diff --git a/keopscore/keopscore/windows_compilations/utils.py b/keopscore/keopscore/windows_compilations/utils.py
index 320a06009..6a005d08e 100644
--- a/keopscore/keopscore/windows_compilations/utils.py
+++ b/keopscore/keopscore/windows_compilations/utils.py
@@ -27,5 +27,5 @@ def find_package_location(package_name: str) -> str:
     if spec.origin:
         return spec.origin
     else:
-        message =  f"Package '{package_name}' not found."
+        message = f"Package '{package_name}' not found."
         raise ImportError(message)

From 5bd1c0e445324f007096242ced8a1942bc7e6b54 Mon Sep 17 00:00:00 2001
From: Louis Pujol <52413616+Louis-Pujol@users.noreply.github.com>
Date: Sat, 28 Jun 2025 09:45:34 +0200
Subject: [PATCH 05/25] move windows compilations to an independent package

---
 keopswindows/pyproject.toml                   | 80 +++++++++++++++++++
 .../src/keopswindows}/__init__.py             |  1 +
 .../src/keopswindows}/compile.py              |  0
 .../src/keopswindows}/compile_nvrtc_jit.py    |  1 +
 .../compile_pykeops_cpp_module.py             |  1 +
 .../keopswindows}/compile_pykeops_nvrtc.py    |  1 +
 .../src/keopswindows}/cuda_detection.py       |  2 +
 .../src/keopswindows}/detection.py            |  0
 .../src/keopswindows}/globals.py              |  0
 .../keopswindows}/templates/CMakeLists.txt    |  0
 .../src/keopswindows}/utils.py                |  0
 11 files changed, 86 insertions(+)
 create mode 100644 keopswindows/pyproject.toml
 rename {keopscore/keopscore/windows_compilations => keopswindows/src/keopswindows}/__init__.py (90%)
 rename {keopscore/keopscore/windows_compilations => keopswindows/src/keopswindows}/compile.py (100%)
 rename {keopscore/keopscore/windows_compilations => keopswindows/src/keopswindows}/compile_nvrtc_jit.py (93%)
 rename {keopscore/keopscore/windows_compilations => keopswindows/src/keopswindows}/compile_pykeops_cpp_module.py (92%)
 rename {keopscore/keopscore/windows_compilations => keopswindows/src/keopswindows}/compile_pykeops_nvrtc.py (93%)
 rename {keopscore/keopscore/windows_compilations => keopswindows/src/keopswindows}/cuda_detection.py (93%)
 rename {keopscore/keopscore/windows_compilations => keopswindows/src/keopswindows}/detection.py (100%)
 rename {keopscore/keopscore/windows_compilations => keopswindows/src/keopswindows}/globals.py (100%)
 rename {keopscore/keopscore/windows_compilations => keopswindows/src/keopswindows}/templates/CMakeLists.txt (100%)
 rename {keopscore/keopscore/windows_compilations => keopswindows/src/keopswindows}/utils.py (100%)

diff --git a/keopswindows/pyproject.toml b/keopswindows/pyproject.toml
new file mode 100644
index 000000000..0eeb255e1
--- /dev/null
+++ b/keopswindows/pyproject.toml
@@ -0,0 +1,80 @@
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[project]
+name = "keopswindows"
+version = "0.1.0"
+authors = [
+    {name = "Louis Pujol"}
+]
+description = "KeOps on windows: adapters for KeOps to work on Windows"
+requires-python = ">=3.7"
+dependencies = [
+    "cmake",
+]
+
+[project.optional-dependencies]
+dev = [
+    "pytest",
+    "pytest-cov",
+    "pre-commit",
+]
+
+
+[tool.pytest.ini_options]
+minversion = "6.0"
+addopts = [
+        "-ra",
+        "--durations=10",
+        "--showlocals",
+        "--strict-markers",
+        "--strict-config",
+        "--doctest-modules",
+        "--cov=keopswindows",
+        "--cov-report=term-missing",
+        "--cov-report=html",
+        "--cov-report=xml"
+        ]
+
+testpaths = ["src/keopswindows", "tests"]
+
+
+# See: https://learn.scientific-python.org/development/guides/style/#PC190
+[tool.ruff]
+line-length = 79
+
+[tool.ruff.lint]
+extend-select = [
+    "B",        # flake8-bugbear
+    "I",        # isort
+    "ARG",      # flake8-unused-arguments
+    "C4",       # flake8-comprehensions
+    "EM",       # flake8-errmsg
+    "ICN",      # flake8-import-conventions
+    "G",        # flake8-logging-format
+    "PGH",      # pygrep-hooks
+    "PIE",      # flake8-pie
+    "PL",       # pylint
+    "PT",       # flake8-pytest-style
+    "PTH",      # flake8-use-pathlib
+    "RET",      # flake8-return
+    "RUF",      # Ruff-specific
+    "SIM",      # flake8-simplify
+    "T20",      # flake8-print
+    "UP",       # pyupgrade
+    "YTT",      # flake8-2020
+    "EXE",      # flake8-executable
+    "NPY",      # NumPy specific rules
+    "FURB",     # refurb
+    "PYI",      # flake8-pyi
+]
+
+[tool.ruff.lint.per-file-ignores]
+"tests/**" = ["T201", "PLR2004"] # it is ok to print in tests + magic values in comparison
+"examples/**" = ["T201"] # also ok to print in examples
+
+
+[tool.ruff.format]
+indent-style = "space"
+docstring-code-format = true
diff --git a/keopscore/keopscore/windows_compilations/__init__.py b/keopswindows/src/keopswindows/__init__.py
similarity index 90%
rename from keopscore/keopscore/windows_compilations/__init__.py
rename to keopswindows/src/keopswindows/__init__.py
index c8ba3a787..cc7a91321 100644
--- a/keopscore/keopscore/windows_compilations/__init__.py
+++ b/keopswindows/src/keopswindows/__init__.py
@@ -12,6 +12,7 @@
     lib_names,
 )
 from .globals import tmp_dir
+from .cuda_detection import cuda_available
 
 __all__ = [
     "compile",
diff --git a/keopscore/keopscore/windows_compilations/compile.py b/keopswindows/src/keopswindows/compile.py
similarity index 100%
rename from keopscore/keopscore/windows_compilations/compile.py
rename to keopswindows/src/keopswindows/compile.py
diff --git a/keopscore/keopscore/windows_compilations/compile_nvrtc_jit.py b/keopswindows/src/keopswindows/compile_nvrtc_jit.py
similarity index 93%
rename from keopscore/keopscore/windows_compilations/compile_nvrtc_jit.py
rename to keopswindows/src/keopswindows/compile_nvrtc_jit.py
index 23bd71679..ea20ced89 100644
--- a/keopscore/keopscore/windows_compilations/compile_nvrtc_jit.py
+++ b/keopswindows/src/keopswindows/compile_nvrtc_jit.py
@@ -33,4 +33,5 @@ def compile_nvrtc_jit(build_folder):
         output_dir=build_folder,
         print_cmakelists=False,
         show_cmake_commands_output=False,
+        clean_tmp_build_dir=False,
     )
diff --git a/keopscore/keopscore/windows_compilations/compile_pykeops_cpp_module.py b/keopswindows/src/keopswindows/compile_pykeops_cpp_module.py
similarity index 92%
rename from keopscore/keopscore/windows_compilations/compile_pykeops_cpp_module.py
rename to keopswindows/src/keopswindows/compile_pykeops_cpp_module.py
index 66e84b94c..47c8f56ba 100644
--- a/keopscore/keopscore/windows_compilations/compile_pykeops_cpp_module.py
+++ b/keopswindows/src/keopswindows/compile_pykeops_cpp_module.py
@@ -27,4 +27,5 @@ def compile_pykeops_cpp_module(source_file, build_folder):
         output_dir=build_folder,
         print_cmakelists=False,
         show_cmake_commands_output=False,
+        clean_tmp_build_dir=False,
     )
diff --git a/keopscore/keopscore/windows_compilations/compile_pykeops_nvrtc.py b/keopswindows/src/keopswindows/compile_pykeops_nvrtc.py
similarity index 93%
rename from keopscore/keopscore/windows_compilations/compile_pykeops_nvrtc.py
rename to keopswindows/src/keopswindows/compile_pykeops_nvrtc.py
index 7fac03c00..77fe5fb48 100644
--- a/keopscore/keopscore/windows_compilations/compile_pykeops_nvrtc.py
+++ b/keopswindows/src/keopswindows/compile_pykeops_nvrtc.py
@@ -33,4 +33,5 @@ def compile_pykeops_nvrtc(build_folder):
         output_dir=build_folder,
         print_cmakelists=False,
         show_cmake_commands_output=False,
+        clean_tmp_build_dir=False,
     )
diff --git a/keopscore/keopscore/windows_compilations/cuda_detection.py b/keopswindows/src/keopswindows/cuda_detection.py
similarity index 93%
rename from keopscore/keopscore/windows_compilations/cuda_detection.py
rename to keopswindows/src/keopswindows/cuda_detection.py
index 5c5d1dbab..43513c95d 100644
--- a/keopscore/keopscore/windows_compilations/cuda_detection.py
+++ b/keopswindows/src/keopswindows/cuda_detection.py
@@ -2,6 +2,8 @@
 from ctypes.util import find_library
 from pathlib import Path
 
+cuda_available = find_library("nvcuda") is not None
+
 
 def detect_cuda_toolkit():
 
diff --git a/keopscore/keopscore/windows_compilations/detection.py b/keopswindows/src/keopswindows/detection.py
similarity index 100%
rename from keopscore/keopscore/windows_compilations/detection.py
rename to keopswindows/src/keopswindows/detection.py
diff --git a/keopscore/keopscore/windows_compilations/globals.py b/keopswindows/src/keopswindows/globals.py
similarity index 100%
rename from keopscore/keopscore/windows_compilations/globals.py
rename to keopswindows/src/keopswindows/globals.py
diff --git a/keopscore/keopscore/windows_compilations/templates/CMakeLists.txt b/keopswindows/src/keopswindows/templates/CMakeLists.txt
similarity index 100%
rename from keopscore/keopscore/windows_compilations/templates/CMakeLists.txt
rename to keopswindows/src/keopswindows/templates/CMakeLists.txt
diff --git a/keopscore/keopscore/windows_compilations/utils.py b/keopswindows/src/keopswindows/utils.py
similarity index 100%
rename from keopscore/keopscore/windows_compilations/utils.py
rename to keopswindows/src/keopswindows/utils.py

From fdba11fd07bf7bbae3185ae2ae63fb2329422cdb Mon Sep 17 00:00:00 2001
From: Louis Pujol <52413616+Louis-Pujol@users.noreply.github.com>
Date: Fri, 11 Jul 2025 09:53:19 +0200
Subject: [PATCH 06/25] move back windows compatibility to keopscore

---
 .../windows_compilations}/__init__.py         |   0
 .../windows_compilations}/compile.py          |   0
 .../compile_nvrtc_jit.py                      |   3 +-
 .../compile_pykeops_cpp_module.py             |   0
 .../compile_pykeops_nvrtc.py                  |   3 +-
 .../windows_compilations}/cuda_detection.py   |   0
 .../windows_compilations}/detection.py        |   0
 .../windows_compilations}/globals.py          |   0
 .../templates/CMakeLists.txt                  |   0
 .../keopscore/windows_compilations}/utils.py  |   0
 keopswindows/pyproject.toml                   |  80 ----------
 .../pykeops/common/keops_io/LoadKeOps_cpp.py  | 130 ++++++++++++++-
 .../common/keops_io/pykeops_nvrtc_win.cpp     | 148 ++++++++++++++++++
 13 files changed, 279 insertions(+), 85 deletions(-)
 rename {keopswindows/src/keopswindows => keopscore/keopscore/windows_compilations}/__init__.py (100%)
 rename {keopswindows/src/keopswindows => keopscore/keopscore/windows_compilations}/compile.py (100%)
 rename {keopswindows/src/keopswindows => keopscore/keopscore/windows_compilations}/compile_nvrtc_jit.py (86%)
 rename {keopswindows/src/keopswindows => keopscore/keopscore/windows_compilations}/compile_pykeops_cpp_module.py (100%)
 rename {keopswindows/src/keopswindows => keopscore/keopscore/windows_compilations}/compile_pykeops_nvrtc.py (92%)
 rename {keopswindows/src/keopswindows => keopscore/keopscore/windows_compilations}/cuda_detection.py (100%)
 rename {keopswindows/src/keopswindows => keopscore/keopscore/windows_compilations}/detection.py (100%)
 rename {keopswindows/src/keopswindows => keopscore/keopscore/windows_compilations}/globals.py (100%)
 rename {keopswindows/src/keopswindows => keopscore/keopscore/windows_compilations}/templates/CMakeLists.txt (100%)
 rename {keopswindows/src/keopswindows => keopscore/keopscore/windows_compilations}/utils.py (100%)
 delete mode 100644 keopswindows/pyproject.toml
 create mode 100644 pykeops/pykeops/common/keops_io/pykeops_nvrtc_win.cpp

diff --git a/keopswindows/src/keopswindows/__init__.py b/keopscore/keopscore/windows_compilations/__init__.py
similarity index 100%
rename from keopswindows/src/keopswindows/__init__.py
rename to keopscore/keopscore/windows_compilations/__init__.py
diff --git a/keopswindows/src/keopswindows/compile.py b/keopscore/keopscore/windows_compilations/compile.py
similarity index 100%
rename from keopswindows/src/keopswindows/compile.py
rename to keopscore/keopscore/windows_compilations/compile.py
diff --git a/keopswindows/src/keopswindows/compile_nvrtc_jit.py b/keopscore/keopscore/windows_compilations/compile_nvrtc_jit.py
similarity index 86%
rename from keopswindows/src/keopswindows/compile_nvrtc_jit.py
rename to keopscore/keopscore/windows_compilations/compile_nvrtc_jit.py
index ea20ced89..7227bdd59 100644
--- a/keopswindows/src/keopswindows/compile_nvrtc_jit.py
+++ b/keopscore/keopscore/windows_compilations/compile_nvrtc_jit.py
@@ -12,7 +12,7 @@
 def compile_nvrtc_jit(build_folder):
 
     keops_dir = Path(find_package_location("keopscore")).parent
-    source_file = keops_dir / "binders" / "nvrtc" / "nvrtc_jit.cpp"
+    source_file = keops_dir / "binders" / "nvrtc" / "nvrtc_jit_win.cpp"
 
     macros = [
         "-DMAXIDGPU=0",
@@ -25,6 +25,7 @@ def compile_nvrtc_jit(build_folder):
 
     compile(
         source_file=source_file,
+        project_name="nvrtc_jit",
         macros=macros,
         includes=[include_dirs[key] for key in ["keops", "cuda"]],
         link_dirs=[lib_dirs[key] for key in ["cuda"]],
diff --git a/keopswindows/src/keopswindows/compile_pykeops_cpp_module.py b/keopscore/keopscore/windows_compilations/compile_pykeops_cpp_module.py
similarity index 100%
rename from keopswindows/src/keopswindows/compile_pykeops_cpp_module.py
rename to keopscore/keopscore/windows_compilations/compile_pykeops_cpp_module.py
diff --git a/keopswindows/src/keopswindows/compile_pykeops_nvrtc.py b/keopscore/keopscore/windows_compilations/compile_pykeops_nvrtc.py
similarity index 92%
rename from keopswindows/src/keopswindows/compile_pykeops_nvrtc.py
rename to keopscore/keopscore/windows_compilations/compile_pykeops_nvrtc.py
index 77fe5fb48..e27a09c3e 100644
--- a/keopswindows/src/keopswindows/compile_pykeops_nvrtc.py
+++ b/keopscore/keopscore/windows_compilations/compile_pykeops_nvrtc.py
@@ -12,7 +12,7 @@
 def compile_pykeops_nvrtc(build_folder):
 
     pykeops_dir = Path(find_package_location("pykeops")).parent
-    source_file = pykeops_dir / "common" / "keops_io" / "pykeops_nvrtc.cpp"
+    source_file = pykeops_dir / "common" / "keops_io" / "pykeops_nvrtc_win.cpp"
 
     macros = [
         "-DMAXIDGPU=0",
@@ -25,6 +25,7 @@ def compile_pykeops_nvrtc(build_folder):
 
     compile(
         source_file=source_file,
+        project_name="pykeops_nvrtc",
         macros=macros,
         includes=[include_dirs[key] for key in ["python", "pybind11", "keops", "cuda"]],
         link_dirs=[lib_dirs[key] for key in ["python", "cuda"]],
diff --git a/keopswindows/src/keopswindows/cuda_detection.py b/keopscore/keopscore/windows_compilations/cuda_detection.py
similarity index 100%
rename from keopswindows/src/keopswindows/cuda_detection.py
rename to keopscore/keopscore/windows_compilations/cuda_detection.py
diff --git a/keopswindows/src/keopswindows/detection.py b/keopscore/keopscore/windows_compilations/detection.py
similarity index 100%
rename from keopswindows/src/keopswindows/detection.py
rename to keopscore/keopscore/windows_compilations/detection.py
diff --git a/keopswindows/src/keopswindows/globals.py b/keopscore/keopscore/windows_compilations/globals.py
similarity index 100%
rename from keopswindows/src/keopswindows/globals.py
rename to keopscore/keopscore/windows_compilations/globals.py
diff --git a/keopswindows/src/keopswindows/templates/CMakeLists.txt b/keopscore/keopscore/windows_compilations/templates/CMakeLists.txt
similarity index 100%
rename from keopswindows/src/keopswindows/templates/CMakeLists.txt
rename to keopscore/keopscore/windows_compilations/templates/CMakeLists.txt
diff --git a/keopswindows/src/keopswindows/utils.py b/keopscore/keopscore/windows_compilations/utils.py
similarity index 100%
rename from keopswindows/src/keopswindows/utils.py
rename to keopscore/keopscore/windows_compilations/utils.py
diff --git a/keopswindows/pyproject.toml b/keopswindows/pyproject.toml
deleted file mode 100644
index 0eeb255e1..000000000
--- a/keopswindows/pyproject.toml
+++ /dev/null
@@ -1,80 +0,0 @@
-[build-system]
-requires = ["hatchling"]
-build-backend = "hatchling.build"
-
-[project]
-name = "keopswindows"
-version = "0.1.0"
-authors = [
-    {name = "Louis Pujol"}
-]
-description = "KeOps on windows: adapters for KeOps to work on Windows"
-requires-python = ">=3.7"
-dependencies = [
-    "cmake",
-]
-
-[project.optional-dependencies]
-dev = [
-    "pytest",
-    "pytest-cov",
-    "pre-commit",
-]
-
-
-[tool.pytest.ini_options]
-minversion = "6.0"
-addopts = [
-        "-ra",
-        "--durations=10",
-        "--showlocals",
-        "--strict-markers",
-        "--strict-config",
-        "--doctest-modules",
-        "--cov=keopswindows",
-        "--cov-report=term-missing",
-        "--cov-report=html",
-        "--cov-report=xml"
-        ]
-
-testpaths = ["src/keopswindows", "tests"]
-
-
-# See: https://learn.scientific-python.org/development/guides/style/#PC190
-[tool.ruff]
-line-length = 79
-
-[tool.ruff.lint]
-extend-select = [
-    "B",        # flake8-bugbear
-    "I",        # isort
-    "ARG",      # flake8-unused-arguments
-    "C4",       # flake8-comprehensions
-    "EM",       # flake8-errmsg
-    "ICN",      # flake8-import-conventions
-    "G",        # flake8-logging-format
-    "PGH",      # pygrep-hooks
-    "PIE",      # flake8-pie
-    "PL",       # pylint
-    "PT",       # flake8-pytest-style
-    "PTH",      # flake8-use-pathlib
-    "RET",      # flake8-return
-    "RUF",      # Ruff-specific
-    "SIM",      # flake8-simplify
-    "T20",      # flake8-print
-    "UP",       # pyupgrade
-    "YTT",      # flake8-2020
-    "EXE",      # flake8-executable
-    "NPY",      # NumPy specific rules
-    "FURB",     # refurb
-    "PYI",      # flake8-pyi
-]
-
-[tool.ruff.lint.per-file-ignores]
-"tests/**" = ["T201", "PLR2004"] # it is ok to print in tests + magic values in comparison
-"examples/**" = ["T201"] # also ok to print in examples
-
-
-[tool.ruff.format]
-indent-style = "space"
-docstring-code-format = true
diff --git a/pykeops/pykeops/common/keops_io/LoadKeOps_cpp.py b/pykeops/pykeops/common/keops_io/LoadKeOps_cpp.py
index 7b632f3c9..17e0b26fb 100644
--- a/pykeops/pykeops/common/keops_io/LoadKeOps_cpp.py
+++ b/pykeops/pykeops/common/keops_io/LoadKeOps_cpp.py
@@ -12,6 +12,7 @@
 from pykeops.config import pykeops_cpp_name, python_includes
 
 
+
 class LoadKeOps_cpp_class(LoadKeOps):
     def __init__(self, *args, fast_init=False):
         super().__init__(*args, fast_init=fast_init)
@@ -33,7 +34,16 @@ def init_phase1(self):
                 flush=True,
                 end="",
             )
-            KeOps_OS_Run(compile_command)
+
+            if os.name == "nt":
+                from keopscore.windows_compilations import compile_pykeops_cpp_module
+
+                compile_pykeops_cpp_module(
+                    source_file=srcname,
+                    build_folder=get_build_folder(),
+                )
+            else:
+                KeOps_OS_Run(compile_command)
             pyKeOps_Message("OK", use_tag=False, flush=True)
 
     def init_phase2(self):
@@ -70,7 +80,8 @@ def call_keops(self, nx, ny):
         )
 
     def get_pybind11_code(self):
-        return f"""
+        if os.name != "nt":
+            return f"""
 #include "{self.params.source_name}"
 
 #include <pybind11/pybind11.h>
@@ -152,7 +163,7 @@ def get_pybind11_code(self):
     }}
 
 
-    return launch_keops_cpu_{self.params.tag}< TYPE >(dimY,
+    return launch_keops_{self.params.tag}_cpu< TYPE >(dimY,
                                                       nx,
                                                       ny,
                                                       tagI,
@@ -180,6 +191,119 @@ def get_pybind11_code(self):
     m.def("launch_pykeops_cpu", &launch_pykeops_{self.params.tag}_cpu < {cpp_dtype[self.params.dtype]} >, "Entry point to keops.");
 }}                     
             """
+        
+        else:
+            return f"""
+#include "{self.params.source_name}"
+
+#include <pybind11/pybind11.h>
+namespace py = pybind11;
+
+template < typename TYPE >
+int launch_pykeops_{self.params.tag}_cpu(int dimY, int nx, int ny,
+                                         int tagI, int tagZero, int use_half,
+                                         int dimred,
+                                         int use_chunk_mode,
+                                         py::tuple py_indsi, py::tuple py_indsj, py::tuple py_indsp,
+                                         int dimout,
+                                         py::tuple py_dimsx, py::tuple py_dimsy, py::tuple py_dimsp,
+                                         py::tuple py_ranges,
+                                         py::tuple py_shapeout,
+                                         long out_void,
+                                         py::tuple py_arg,
+                                         py::tuple py_argshape){{
+
+    /*------------------------------------*/
+    /*         Cast input args            */
+    /*------------------------------------*/
+
+    std::vector< int > indsi_v(py_indsi.size());
+    for (auto i = 0; i < py_indsi.size(); i++)
+        indsi_v[i] = py::cast< int >(py_indsi[i]);
+
+
+    std::vector< int > indsj_v(py_indsj.size());
+    for (auto i = 0; i < py_indsj.size(); i++)
+        indsj_v[i] = py::cast< int >(py_indsj[i]);
+
+
+    std::vector< int > indsp_v(py_indsp.size());
+    for (auto i = 0; i < py_indsp.size(); i++)
+        indsp_v[i] = py::cast< int >(py_indsp[i]);
+
+
+    std::vector< int > dimsx_v(py_dimsx.size());
+    for (auto i = 0; i < py_dimsx.size(); i++)
+        dimsx_v[i] = py::cast< int >(py_dimsx[i]);
+
+
+    std::vector< int > dimsy_v(py_dimsy.size());
+    for (auto i = 0; i < py_dimsy.size(); i++)
+        dimsy_v[i] = py::cast< int >(py_dimsy[i]);
+        
+    
+    std::vector< int > dimsp_v(py_dimsp.size());
+    for (auto i = 0; i < py_dimsp.size(); i++)
+        dimsp_v[i] = py::cast< int >(py_dimsp[i]);
+
+
+    // Cast the ranges arrays
+    std::vector< int* > ranges_v(py_ranges.size());
+    for (int i = 0; i < py_ranges.size(); i++)
+        ranges_v[i] = (int*) py::cast< long >(py_ranges[i]);
+    int **ranges = (int**) ranges_v.data();
+    
+    std::vector< int > shapeout_v(py_shapeout.size());
+    for (auto i = 0; i < py_shapeout.size(); i++)
+        shapeout_v[i] = py::cast< int >(py_shapeout[i]);
+    
+    TYPE *out = (TYPE*) out_void;
+    // std::cout << "out_ptr : " << (long) out << std::endl;
+    
+    std::vector< TYPE* > arg_v(py_arg.size());
+    for (int i = 0; i < py_arg.size(); i++)
+        arg_v[i] = (TYPE*) py::cast< long >(py_arg[i]);
+    TYPE **arg = (TYPE**) arg_v.data();
+    
+    std::vector< std::vector< int > > argshape_v(py_argshape.size());
+    for (auto i = 0; i < py_argshape.size(); i++){{
+        py::tuple tmp = py_argshape[i];
+        std::vector< int > tmp_v(tmp.size());
+        for (auto j =0; j < tmp.size(); j++)
+            tmp_v[j] = py::cast< int >(tmp[j]);
+        argshape_v[i] = tmp_v;
+    }}
+
+
+    return launch_keops_cpu_{self.params.tag}< TYPE >(dimY,
+                                                      nx,
+                                                      ny,
+                                                      tagI,
+                                                      tagZero,
+                                                      use_half,
+                                                      dimred,
+                                                      use_chunk_mode,
+                                                      indsi_v,
+                                                      indsj_v,
+                                                      indsp_v,
+                                                      dimout,
+                                                      dimsx_v,
+                                                      dimsy_v,
+                                                      dimsp_v,
+                                                      ranges,
+                                                      shapeout_v,
+                                                      out,
+                                                      arg,
+                                                      argshape_v);
+
+}}
+
+PYBIND11_MODULE(pykeops_cpp_{self.params.tag}, m) {{
+    m.doc() = "pyKeOps: KeOps for pytorch through pybind11 (pytorch flavour).";
+    m.def("launch_pykeops_cpu", &launch_pykeops_{self.params.tag}_cpu < {cpp_dtype[self.params.dtype]} >, "Entry point to keops.");
+}}                     
+            """.replace("long", "int64_t")
+
 
 
 LoadKeOps_cpp = Cache_partial(
diff --git a/pykeops/pykeops/common/keops_io/pykeops_nvrtc_win.cpp b/pykeops/pykeops/common/keops_io/pykeops_nvrtc_win.cpp
new file mode 100644
index 000000000..20d8c3d3b
--- /dev/null
+++ b/pykeops/pykeops/common/keops_io/pykeops_nvrtc_win.cpp
@@ -0,0 +1,148 @@
+// g++ -std=c++11  -shared -fPIC -O3 -fpermissive -lcuda -lnvrtc  -L/usr/lib -L/opt/cuda/lib64 -L/opt/cuda/targets/x86_64-linux/lib/ -I/opt/cuda/targets/x86_64-linux/include/ -I /home/bcharlier/projets/keops/keops/keopscore/ -I/usr/include/python3.10/ -DnvrtcGetTARGET=nvrtcGetCUBIN -DnvrtcGetTARGETSize=nvrtcGetCUBINSize -DARCHTAG=\"sm\"  keops_io.cpp -o keops_io.cpython-310-x86_64-linux-gnu.so
+
+
+#include <binders/nvrtc/keops_nvrtc_win.cpp>
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+template< typename TYPE >
+class KeOps_module_python : public KeOps_module< TYPE > {
+public:
+
+    using KeOps_module< TYPE >::KeOps_module;
+
+
+    int operator()(int tagHostDevice, int dimY, int nx, int ny,
+                   int tagI, int tagZero, int use_half,
+                   int tag1D2D, int dimred,
+                   int cuda_block_size, int use_chunk_mode,
+                   py::tuple py_indsi, py::tuple py_indsj, py::tuple py_indsp,
+                   int dimout,
+                   py::tuple py_dimsx, py::tuple py_dimsy, py::tuple py_dimsp,
+                   py::tuple py_ranges,
+                   py::tuple py_shapeout,
+                   py::tuple out_void,
+                   py::tuple py_arg,
+                   py::tuple py_argshape
+    ) {
+
+        /*------------------------------------*/
+        /*         Cast input args            */
+        /*------------------------------------*/
+
+        std::vector< int > indsi_v(py_indsi.size());
+        for (auto i = 0; i < py_indsi.size(); i++)
+            indsi_v[i] = py::cast< int >(py_indsi[i]);
+
+
+        std::vector< int > indsj_v(py_indsj.size());
+        for (auto i = 0; i < py_indsj.size(); i++)
+            indsj_v[i] = py::cast< int >(py_indsj[i]);
+
+
+        std::vector< int > indsp_v(py_indsp.size());
+        for (auto i = 0; i < py_indsp.size(); i++)
+            indsp_v[i] = py::cast< int >(py_indsp[i]);
+
+
+        std::vector< int > dimsx_v(py_dimsx.size());
+        for (auto i = 0; i < py_dimsx.size(); i++)
+            dimsx_v[i] = py::cast< int >(py_dimsx[i]);
+
+
+        std::vector< int > dimsy_v(py_dimsy.size());
+        for (auto i = 0; i < py_dimsy.size(); i++)
+            dimsy_v[i] = py::cast< int >(py_dimsy[i]);
+
+
+        std::vector< int > dimsp_v(py_dimsp.size());
+        for (auto i = 0; i < py_dimsp.size(); i++)
+            dimsp_v[i] = py::cast< int >(py_dimsp[i]);
+
+
+        // Cast the ranges arrays
+        std::vector< int * > ranges_v(py_ranges.size());
+        for (int i = 0; i < py_ranges.size(); i++)
+            ranges_v[i] = (int *) py::cast< int64_t >(py_ranges[i]);
+        int **ranges = (int **) ranges_v.data();
+
+        // for (auto i: ranges_v)
+        //    std::cout << " " <<  (long) i << " ";
+        // std::cout << std::endl;
+
+        //for (auto i=0; i<7; i++)
+        //   std::cout << " " <<  (long) ranges[i] << " ";
+        //std::cout << std::endl;
+
+        std::vector< int > shapeout_v(py_shapeout.size());
+        for (auto i = 0; i < py_shapeout.size(); i++)
+            shapeout_v[i] = py::cast< int >(py_shapeout[i]);
+
+        TYPE *out = (TYPE *)py::cast< int64_t >(out_void[0]);
+        // std::cout << "out_ptr : " << (long) out << std::endl;
+
+        std::vector < TYPE * > arg_v(py_arg.size());
+        for (int i = 0; i < py_arg.size(); i++)
+            arg_v[i] = (TYPE *) py::cast< int64_t >(py_arg[i]);
+        TYPE **arg = (TYPE **) arg_v.data();
+
+        std::vector <std::vector< int >> argshape_v(py_argshape.size());
+        for (auto i = 0; i < py_argshape.size(); i++) {
+            py::tuple tmp = py_argshape[i];
+            std::vector< int > tmp_v(tmp.size());
+            for (auto j = 0; j < tmp.size(); j++)
+                tmp_v[j] = py::cast< int >(tmp[j]);
+            argshape_v[i] = tmp_v;
+        }
+
+//        for (auto i : argshape_v)
+//            for (auto j : i)
+//                std::cout << j << " " ;
+
+        return KeOps_module< TYPE >::launch_kernel(tagHostDevice,
+                                                   dimY,
+                                                   nx,
+                                                   ny,
+                                                   tagI,
+                                                   tagZero,
+                                                   use_half,
+                                                   tag1D2D,
+                                                   dimred,
+                                                   cuda_block_size,
+                                                   use_chunk_mode,
+                                                   indsi_v,
+                                                   indsj_v,
+                                                   indsp_v,
+                                                   dimout,
+                                                   dimsx_v,
+                                                   dimsy_v,
+                                                   dimsp_v,
+                                                   ranges,
+                                                   shapeout_v,
+                                                   out,
+                                                   arg,
+                                                   argshape_v);
+    }
+
+};
+/////////////////////////////////////////////////////////////////////////////////
+//                    PyBind11 entry point                                     //
+/////////////////////////////////////////////////////////////////////////////////
+
+
+PYBIND11_MODULE(pykeops_nvrtc, m) {
+m.doc() = "pyKeOps: KeOps for pytorch through pybind11 (pytorch flavour).";
+
+py::class_< KeOps_module_python< float > >(m, "KeOps_module_float")
+.def(py::init<int, int, const char *>())
+.def("__call__", &KeOps_module_python< float >::operator());
+
+py::class_< KeOps_module_python< double > >(m, "KeOps_module_double")
+.def(py::init<int, int, const char *>())
+.def("__call__", &KeOps_module_python< double >::operator());
+
+py::class_< KeOps_module_python< half2 > >(m, "KeOps_module_half2")
+.def(py::init<int, int, const char *>())
+.def("__call__", &KeOps_module_python< half2 >::operator());
+}

From b3def29dbf0126e0d0775cbf767bc80323bf7789 Mon Sep 17 00:00:00 2001
From: Louis Pujol <52413616+Louis-Pujol@users.noreply.github.com>
Date: Fri, 11 Jul 2025 09:54:22 +0200
Subject: [PATCH 07/25] add windows specific cpp files

---
 keopscore/keopscore/binders/LinkCompile.py    |   6 +-
 .../binders/nvrtc/Gpu_link_compile.py         |  66 +-
 .../binders/nvrtc/keops_nvrtc_win.cpp         | 604 ++++++++++++++++++
 .../keopscore/binders/nvrtc/nvrtc_jit_win.cpp | 114 ++++
 keopscore/keopscore/config/__init__.py        |   6 +-
 keopscore/keopscore/config/cuda_windows.py    | 207 ++++++
 keopscore/keopscore/include/CudaSizes_win.h   |  96 +++
 keopscore/keopscore/include/Ranges_win.h      |  99 +++
 keopscore/keopscore/include/Sizes_win.h       | 402 ++++++++++++
 .../keopscore/include/ranges_utils_win.h      |  82 +++
 keopscore/keopscore/include/utils_pe_win.h    | 100 +++
 .../common/keops_io/LoadKeOps_nvrtc.py        |   8 +-
 12 files changed, 1767 insertions(+), 23 deletions(-)
 create mode 100644 keopscore/keopscore/binders/nvrtc/keops_nvrtc_win.cpp
 create mode 100644 keopscore/keopscore/binders/nvrtc/nvrtc_jit_win.cpp
 create mode 100644 keopscore/keopscore/config/cuda_windows.py
 create mode 100644 keopscore/keopscore/include/CudaSizes_win.h
 create mode 100644 keopscore/keopscore/include/Ranges_win.h
 create mode 100644 keopscore/keopscore/include/Sizes_win.h
 create mode 100644 keopscore/keopscore/include/ranges_utils_win.h
 create mode 100644 keopscore/keopscore/include/utils_pe_win.h

diff --git a/keopscore/keopscore/binders/LinkCompile.py b/keopscore/keopscore/binders/LinkCompile.py
index aae5d70ea..e561745ce 100644
--- a/keopscore/keopscore/binders/LinkCompile.py
+++ b/keopscore/keopscore/binders/LinkCompile.py
@@ -84,7 +84,11 @@ def read_info(self):
     def write_code(self):
         # write the generated code in the source file ; this is used as a subfunction of compile_code
         f = open(self.gencode_file, "w")
-        f.write(self.code)
+        if os.name == "nt":
+            f.write(self.code.replace("signed long int", "int"))
+        else:
+            f.write(self.code)
+            
         f.close()
 
     def generate_code(self):
diff --git a/keopscore/keopscore/binders/nvrtc/Gpu_link_compile.py b/keopscore/keopscore/binders/nvrtc/Gpu_link_compile.py
index ccd6eb00d..e0b90774f 100644
--- a/keopscore/keopscore/binders/nvrtc/Gpu_link_compile.py
+++ b/keopscore/keopscore/binders/nvrtc/Gpu_link_compile.py
@@ -1,6 +1,5 @@
 import os
 from ctypes import create_string_buffer, CDLL, c_int
-from os import RTLD_LAZY
 import sysconfig
 from os.path import join
 
@@ -34,10 +33,16 @@
 
 
 def jit_compile_dll():
-    return os.path.join(
-        build_folder,
-        "nvrtc_jit" + sysconfig.get_config_var("SHLIB_SUFFIX"),
-    )
+    if os.name == "nt":
+        return os.path.join(
+            build_folder,
+            "nvrtc_jit.dll"
+        )
+    else:
+        return os.path.join(
+            build_folder,
+            "nvrtc_jit" + sysconfig.get_config_var("SHLIB_SUFFIX"),
+        )
 
 
 class Gpu_link_compile(LinkCompile):
@@ -61,7 +66,10 @@ def __init__(self):
             self.low_level_code_prefix + self.gencode_filename,
         ).encode("utf-8")
 
-        self.my_c_dll = CDLL(jit_compile_dll(), mode=RTLD_LAZY)
+        if os.name != "nt":
+            self.my_c_dll = CDLL(jit_compile_dll(), mode=os.RTLD_LAZY)
+        else:
+            self.my_c_dll = CDLL(jit_compile_dll())
         # actual dll to be called is the jit binary, TODO: check if this is relevent
         self.true_dllname = jit_binary
         # file to check for existence to detect compilation is needed
@@ -75,16 +83,28 @@ def generate_code(self):
         self.write_code()
         # we execute the main dll, passing the code as argument, and the name of the low level code file to save the assembly instructions
 
-        res = self.my_c_dll.Compile(
-            create_string_buffer(self.low_level_code_file),
-            create_string_buffer(self.code.encode("utf-8")),
-            c_int(self.use_half),
-            c_int(self.use_fast_math),
-            c_int(self.device_id),
-            create_string_buffer(
-                (custom_cuda_include_fp16_path() + os.path.sep).encode("utf-8")
-            ),
-        )
+        # The Windows JIT engine's Compile() takes no use_fast_math argument, hence the two call forms.
+        if os.name != "nt":
+            res = self.my_c_dll.Compile(
+                create_string_buffer(self.low_level_code_file),
+                create_string_buffer(self.code.encode("utf-8")),
+                c_int(self.use_half),
+                c_int(self.use_fast_math),
+                c_int(self.device_id),
+                create_string_buffer(
+                    (custom_cuda_include_fp16_path() + os.path.sep).encode("utf-8")
+                ),
+            )
+        else:
+            res = self.my_c_dll.Compile(
+                create_string_buffer(self.low_level_code_file),
+                create_string_buffer(self.code.encode("utf-8")),
+                c_int(self.use_half),
+                c_int(self.device_id),
+                create_string_buffer(
+                    (custom_cuda_include_fp16_path() + os.path.sep).encode("utf-8")
+                ),
+            )
         if res != 0:
             KeOps_Error(
                 f"Error when compiling formula (error in nvrtcCompileProgram, nvrtcResult={res})"
@@ -116,8 +136,14 @@ def get_compile_command(
     @staticmethod
     def compile_jit_compile_dll():
         KeOps_Message("Compiling cuda jit compiler engine ... ", flush=True, end="")
-        command = Gpu_link_compile.get_compile_command(
-            sourcename=jit_compile_src, dllname=jit_compile_dll()
-        )
-        KeOps_OS_Run(command)
+        if os.name != "nt":
+            KeOps_OS_Run(
+                Gpu_link_compile.get_compile_command(
+                    sourcename=jit_compile_src, dllname=jit_compile_dll()
+                )
+            )
+        else:
+            # Windows builds are delegated to the windows_compilations package.
+            from ...windows_compilations import compile_nvrtc_jit
+            compile_nvrtc_jit(build_folder=build_folder)
         KeOps_Message("OK", use_tag=False, flush=True)
diff --git a/keopscore/keopscore/binders/nvrtc/keops_nvrtc_win.cpp b/keopscore/keopscore/binders/nvrtc/keops_nvrtc_win.cpp
new file mode 100644
index 000000000..017201363
--- /dev/null
+++ b/keopscore/keopscore/binders/nvrtc/keops_nvrtc_win.cpp
@@ -0,0 +1,604 @@
+
+// nvcc -shared -Xcompiler -fPIC -lnvrtc -lcuda keops_nvrtc.cu -o keops_nvrtc.so
+// g++ --verbose -L/opt/cuda/lib64 -L/opt/cuda/targets/x86_64-linux/lib/ -I/opt/cuda/targets/x86_64-linux/include/ -I../../include -shared -fPIC -lcuda -lnvrtc -fpermissive -DMAXIDGPU=0 -DMAXTHREADSPERBLOCK0=1024 -DSHAREDMEMPERBLOCK0=49152 -DnvrtcGetTARGET=nvrtcGetCUBIN -DnvrtcGetTARGETSize=nvrtcGetCUBINSize -DARCHTAG=\"sm\" keops_nvrtc.cpp -o keops_nvrtc.so
+// g++ -std=c++11  -shared -fPIC -O3 -fpermissive -L /usr/lib -L /opt/cuda/lib64 -lcuda -lnvrtc -DnvrtcGetTARGET=nvrtcGetCUBIN -DnvrtcGetTARGETSize=nvrtcGetCUBINSize -DARCHTAG=\"sm\"  -I/home/bcharlier/projets/keops/keops/keops/include -I/opt/cuda/include -I/usr/include/python3.10/ -DMAXIDGPU=0 -DMAXTHREADSPERBLOCK0=1024 -DSHAREDMEMPERBLOCK0=49152  /home/bcharlier/projets/keops/keops/keops/binders/nvrtc/keops_nvrtc.cpp -o keops_nvrtc.cpython-310-x86_64-linux-gnu.so
+
+#include <nvrtc.h>
+#include <cuda.h>
+#include <stdio.h>
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <stdarg.h>
+#include <vector>
+#include <numeric>
+//#include <ctime>
+
+#define C_CONTIGUOUS 1
+#define USE_HALF 0
+
+#include "include/Sizes_win.h"
+#include "include/Ranges_win.h"
+#include "include/utils_pe_win.h"
+#include "include/ranges_utils_win.h"
+
+
+#include "include/CudaSizes_win.h"
+#include <cuda_fp16.h>
+
+
+int *build_offset_tables(int nbatchdims, int *shapes, int nblocks, int *lookup_h,
+                         const std::vector< int > &indsi,
+                         const std::vector< int > &indsj,
+                         const std::vector< int > &indsp,
+                         int tagJ) {
+    // Builds the (nblocks, sizevars) table of per-block variable offsets and uploads it to the device.
+    int sizei = indsi.size();
+    int sizej = indsj.size();
+    int sizep = indsp.size();
+
+    // Support for broadcasting over batch dimensions =============================================
+
+    int sizevars = sizei + sizej + sizep;
+
+    // Separate and store the shapes of the "i" and "j" variables + parameters --------------
+    //
+    // shapes is an array of size (1+nargs)*(nbatchdims+3), which looks like:
+    // [ A, .., B, M, N, D_out]  -> output
+    // [ A, .., B, M, 1, D_1  ]  -> "i" variable
+    // [ A, .., B, 1, N, D_2  ]  -> "j" variable
+    // [ A, .., B, 1, 1, D_3  ]  -> "parameter"
+    // [ A, .., 1, M, 1, D_4  ]  -> N.B.: we support broadcasting on the batch dimensions!
+    // [ 1, .., 1, M, 1, D_5  ]  ->      (we'll just ask users to fill in the shapes with *explicit* ones)
+
+    // std::vector replaces the former C arrays here: the sizes depend on run-time values.
+    std::vector<int> shapes_i(sizei * (nbatchdims + 1));
+    std::vector<int> shapes_j(sizej * (nbatchdims + 1));
+    std::vector<int> shapes_p(sizep * (nbatchdims + 1));
+
+    // First, we fill shapes_i with the "relevant" shapes of the "i" variables,
+    // making it look like, say:
+    // [ A, .., B, M]
+    // [ A, .., 1, M]
+    // [ A, .., A, M]
+    // Then, we do the same for shapes_j, but with "N" instead of "M".
+    // And finally for the parameters, with "1" instead of "M".
+    fill_shapes(nbatchdims, shapes, shapes_i.data(), shapes_j.data(), shapes_p.data(), tagJ, indsi, indsj, indsp);
+
+    int tagIJ = tagJ; // 1 if the reduction is made "over j", 0 if it is made "over i"
+	int M = shapes[nbatchdims], N = shapes[nbatchdims + 1];
+
+    // We create a lookup table, "offsets", of shape (nblocks, SIZEVARS) --------
+    int *offsets_d = NULL;
+
+    // Host-side staging copy of the table (std::vector, since its size is a run-time value).
+    std::vector<int> offsets_h(nblocks * sizevars);
+    // lookup_h stores three ints per block: [3k] is the range index and [3k+1] the start index used for the patch offset.
+    for (int k = 0; k < nblocks; k++) {
+        int range_id = (int) lookup_h[3 * k];
+        int start_x = tagIJ ? range_id * M : range_id * N;
+        int start_y = tagIJ ? range_id * N : range_id * M;
+
+        int patch_offset = (int) (lookup_h[3 * k + 1] - start_x);
+
+        vect_broadcast_index(start_x, nbatchdims, sizei, shapes, shapes_i.data(), offsets_h.data() + k * sizevars, patch_offset);
+        vect_broadcast_index(start_y, nbatchdims, sizej, shapes, shapes_j.data(), offsets_h.data() + k * sizevars + sizei);
+        vect_broadcast_index(range_id, nbatchdims, sizep, shapes, shapes_p.data(), offsets_h.data() + k * sizevars + sizei + sizej);
+    }
+    // The table is returned as a raw device pointer that this function does not free.
+    CUDA_SAFE_CALL(cuMemAlloc((CUdeviceptr * ) & offsets_d, sizeof(int) * nblocks * sizevars));
+    CUDA_SAFE_CALL(cuMemcpyHtoD((CUdeviceptr) offsets_d, offsets_h.data(), sizeof(int) * nblocks * sizevars));
+
+    return offsets_d;
+}
+
+
+void range_preprocess_from_device(int &nblocks, int tagI, int nranges_x, int nranges_y, int **castedranges,
+                                  int nbatchdims, int *&slices_x_d, int *&ranges_y_d,
+                                  int *&lookup_d, int *&offsets_d, int blockSize_x,
+                                  const std::vector< int > &indsi,
+                                  const std::vector< int > &indsj,
+                                  const std::vector< int > &indsp,
+                                  int *shapes) {
+    // Sets nblocks, uploads the block lookup table and makes slices_x_d / ranges_y_d usable on the device.
+    // Ranges pre-processing... ==================================================================
+
+    // N.B.: In the following code, we assume that the x-ranges do not overlap.
+    //       Otherwise, we'd have to assume that DIMRED == DIMOUT
+    //       or allocate a buffer of size nx * DIMRED. This may be done in the future.
+    // Cf. reduction.h:
+    //    FUN::tagJ = 1 for a reduction over j, result indexed by i
+    //    FUN::tagJ = 0 for a reduction over i, result indexed by j
+
+    int tagJ = 1 - tagI;
+    int nranges = tagJ ? nranges_x : nranges_y;
+
+    int *ranges_x = tagJ ? castedranges[0] : castedranges[3];
+    int *slices_x = tagJ ? castedranges[1] : castedranges[4];
+    int *ranges_y = tagJ ? castedranges[2] : castedranges[5];
+
+    std::vector<int> ranges_x_h_arr(2 * nranges);
+    int* ranges_x_h;
+
+    // The code below needs a pointer to ranges_x on *host* memory,  -------------------
+    // as well as pointers to slices_x and ranges_y on *device* memory.
+    // -> Depending on the "ranges" location, we'll copy ranges_x *or* slices_x and ranges_y
+    //    to the appropriate memory:
+    bool ranges_on_device = (nbatchdims == 0);
+    // N.B.: We only support Host ranges with Device data when these ranges were created
+    //       to emulate block-sparse reductions.
+
+    if (ranges_on_device) {  // The ranges are on the device
+        ranges_x_h = ranges_x_h_arr.data();  // well-defined even when nranges == 0
+        // Send data from device to host (checked like every other driver call in this file).
+        CUDA_SAFE_CALL(cuMemcpyDtoH(ranges_x_h, (CUdeviceptr) ranges_x, sizeof(int) * 2 * nranges));
+        slices_x_d = slices_x;
+        ranges_y_d = ranges_y;
+    } else {  // The ranges are on host memory; this is typically what happens with **batch processing**,
+        // with ranges generated by keops_io.h:
+        ranges_x_h = ranges_x;
+        // Copy "slices_x" to the device:
+        CUDA_SAFE_CALL(cuMemAlloc((CUdeviceptr * ) & slices_x_d, sizeof(int) * nranges));
+        CUDA_SAFE_CALL(cuMemcpyHtoD((CUdeviceptr) slices_x_d, slices_x, sizeof(int) * nranges));
+
+        // Copy "redranges_y" to the device: with batch processing, we KNOW that they have the same shape as ranges_x
+        CUDA_SAFE_CALL(cuMemAlloc((CUdeviceptr * ) & ranges_y_d, sizeof(int) * 2 * nranges));
+        CUDA_SAFE_CALL(cuMemcpyHtoD((CUdeviceptr) ranges_y_d, ranges_y, sizeof(int) * 2 * nranges));
+    }
+
+    // Computes the number of blocks needed ---------------------------------------------
+    nblocks = 0;
+    int len_range = 0;
+    for (int i = 0; i < nranges; i++) {
+        len_range = ranges_x_h[2 * i + 1] - ranges_x_h[2 * i];
+        nblocks += (len_range / blockSize_x) + (len_range % blockSize_x == 0 ? 0 : 1);
+    }
+
+    // Create a lookup table for the blocks --------------------------------------------
+    std::vector<int> lookup_h(3 * nblocks);
+    int index = 0;
+
+    for (int i = 0; i < nranges; i++) {
+        len_range = ranges_x_h[2 * i + 1] - ranges_x_h[2 * i];
+        for (int j = 0; j < len_range; j += blockSize_x) {
+            lookup_h[3 * index] = i;
+            lookup_h[3 * index + 1] = ranges_x_h[2 * i] + j;
+            lookup_h[3 * index + 2] = ranges_x_h[2 * i] + j + std::min((int) blockSize_x, len_range - j);
+            index++;
+        }
+    }
+
+    // Load the table on the device -----------------------------------------------------
+    CUDA_SAFE_CALL(cuMemAlloc((CUdeviceptr * ) &lookup_d, sizeof(int) * 3 * nblocks));
+    CUDA_SAFE_CALL(cuMemcpyHtoD((CUdeviceptr) lookup_d, lookup_h.data(), sizeof(int) * 3 * nblocks));
+
+
+    // Support for broadcasting over batch dimensions =============================================
+
+    // We create a lookup table, "offsets", of shape (nblock, SIZEVARS):
+
+    if (nbatchdims > 0) {
+        offsets_d = build_offset_tables(nbatchdims, shapes, nblocks, lookup_h.data(),
+                                        indsi, indsj, indsp, tagJ);
+    }
+
+
+}
+
+
+void
+range_preprocess_from_host(int &nblocks, int tagI, int nranges_x, int nranges_y, int nredranges_x, int nredranges_y,
+                           int **castedranges,
+                           int nbatchdims, int *&slices_x_d, int *&ranges_y_d,
+                           int *&lookup_d, int *&offsets_d, int blockSize_x,
+                           const std::vector< int > &indsi,
+                           const std::vector< int > &indsj,
+                           const std::vector< int > &indsp,
+                           int *shapes) {
+    // Sets nblocks and uploads the lookup table, slices_x and ranges_y from host memory to the device.
+    // Ranges pre-processing... ==================================================================
+
+    // N.B.: In the following code, we assume that the x-ranges do not overlap.
+    //       Otherwise, we'd have to assume that DIMRED == DIMOUT
+    //       or allocate a buffer of size nx * DIMRED. This may be done in the future.
+    // Cf. reduction.h:
+    //    FUN::tagJ = 1 for a reduction over j, result indexed by i
+    //    FUN::tagJ = 0 for a reduction over i, result indexed by j
+
+    int tagJ = 1 - tagI;
+    int nranges = tagJ ? nranges_x : nranges_y;
+    int nredranges = tagJ ? nredranges_y : nredranges_x;
+
+    int *ranges_x = tagJ ? castedranges[0] : castedranges[3];
+    int *slices_x = tagJ ? castedranges[1] : castedranges[4];
+    int *ranges_y = tagJ ? castedranges[2] : castedranges[5];
+
+    // Computes the number of blocks needed ---------------------------------------------
+    nblocks = 0;
+    int len_range = 0;
+    for (int i = 0; i < nranges; i++) {
+        len_range = ranges_x[2 * i + 1] - ranges_x[2 * i];
+        nblocks += (len_range / blockSize_x) + (len_range % blockSize_x == 0 ? 0 : 1);
+    }
+
+    // Create a lookup table for the blocks --------------------------------------------
+    std::vector<int> lookup_h(3 * nblocks);
+    int index = 0;
+
+    for (int i = 0; i < nranges; i++) {
+        len_range = ranges_x[2 * i + 1] - ranges_x[2 * i];
+        for (int j = 0; j < len_range; j += blockSize_x) {
+            lookup_h[3 * index] = i;
+            lookup_h[3 * index + 1] = ranges_x[2 * i] + j;
+            lookup_h[3 * index + 2] = ranges_x[2 * i] + j + std::min((int) blockSize_x, len_range - j);
+            index++;
+        }
+    }
+
+    // Load the table on the device -----------------------------------------------------
+    CUDA_SAFE_CALL(cuMemAlloc((CUdeviceptr * ) & lookup_d, sizeof(int) * 3 * nblocks));
+    CUDA_SAFE_CALL(cuMemcpyHtoD((CUdeviceptr) lookup_d, lookup_h.data(), sizeof(int) * 3 * nblocks));
+
+    // Send data from host to device (slices_x: nranges ints, as in range_preprocess_from_device):
+    CUDA_SAFE_CALL(cuMemAlloc((CUdeviceptr * ) & slices_x_d, sizeof(int) * nranges));
+    CUDA_SAFE_CALL(cuMemcpyHtoD((CUdeviceptr) slices_x_d, slices_x, sizeof(int) * nranges));
+
+    CUDA_SAFE_CALL(cuMemAlloc((CUdeviceptr * ) & ranges_y_d, sizeof(int) * 2 * nredranges));
+    CUDA_SAFE_CALL(cuMemcpyHtoD((CUdeviceptr) ranges_y_d, ranges_y, sizeof(int) * 2 * nredranges));
+
+
+    // Support for broadcasting over batch dimensions =============================================
+
+    // We create a lookup table, "offsets", of shape (nblock, SIZEVARS):
+
+    if (nbatchdims > 0) {
+        offsets_d = build_offset_tables(nbatchdims, shapes, nblocks, lookup_h.data(),
+                                        indsi, indsj, indsp, tagJ);
+    }
+
+
+}
+
+
+// Loads a compiled KeOps formula (ptx or cubin file) as a CUDA module on one device and launches its kernels.
+template< typename TYPE >
+class KeOps_module {
+public :
+    CUdevice cuDevice;
+    CUcontext ctx;
+    CUmodule module;
+    char *target;       // contents of the ptx/cubin file; allocated in Read_Target, released in the destructor
+    CUdeviceptr buffer; // device array of nargs pointers, allocated once in the constructor
+    int nargs;
+    // Makes ctx the current CUDA context, pushing it if it is not already current.
+    void SetContext() {
+        CUcontext current_ctx;
+        CUDA_SAFE_CALL_NO_EXCEPTION(cuCtxGetCurrent(&current_ctx));
+        if (current_ctx != ctx)
+            CUDA_SAFE_CALL_NO_EXCEPTION(cuCtxPushCurrent(ctx));
+        CUDA_SAFE_CALL_NO_EXCEPTION(cuCtxGetCurrent(&current_ctx));
+    }
+
+    // Reads the compiled target: a size_t byte count followed by that many bytes. NOTE(review): stream errors are not checked.
+    void Read_Target(const char *target_file_name) {
+        std::ifstream rf(target_file_name, std::ifstream::binary);
+        size_t targetSize;
+        rf.read((char *) &targetSize, sizeof(size_t));
+        target = new char[targetSize];
+        rf.read(target, targetSize);
+        rf.close();
+
+    }
+
+    // Retains the primary context of device_id, loads the module from target_file_name and allocates buffer.
+    KeOps_module(int device_id, int nargs_, const char *target_file_name) {
+
+        nargs = nargs_;
+
+        // init cuda in case not already done
+        CUDA_SAFE_CALL(cuInit(0));
+
+        // get the device and the primary context corresponding to device_id
+        CUDA_SAFE_CALL(cuDeviceGet(&cuDevice, device_id));
+        CUDA_SAFE_CALL(cuDevicePrimaryCtxRetain(&ctx, cuDevice));
+
+        // set the primary context as the active current context
+        SetContext();
+
+        // set global variables giving some properties of device
+        SetGpuProps(device_id);
+
+        // read the ptx or cubin file into a char array
+        Read_Target(target_file_name);
+
+        // load the corresponding module
+        CUDA_SAFE_CALL(cuModuleLoadDataEx(&module, target, 0, NULL, NULL));
+
+        // allocate a small memory buffer for "on device" computation mode,
+        // This is just used for storing the list of pointers to device data
+        // as a device array ; it is better to allocate it here once for all,
+        // otherwise allocating it at each call may cause a small overhead.
+        CUDA_SAFE_CALL(cuMemAlloc(&buffer, nargs * sizeof(TYPE *)));
+
+    }
+
+    // Releases buffer, the module, the retained primary context and the target array.
+    ~KeOps_module() {
+        SetContext();
+        CUDA_SAFE_CALL_NO_EXCEPTION(cuMemFree(buffer));
+        CUDA_SAFE_CALL_NO_EXCEPTION(cuModuleUnload(module));
+        CUDA_SAFE_CALL_NO_EXCEPTION(cuDevicePrimaryCtxRelease(cuDevice));
+        delete[] target;
+    }
+    // Launches the reduction kernel(s) of the loaded module; the return value is always 0.
+    int launch_kernel(int tagHostDevice, int dimY, int nx, int ny,
+                      int tagI, int tagZero, int use_half,
+                      int tag1D2D, int dimred,
+                      int cuda_block_size, int use_chunk_mode,
+                      std::vector< int > indsi, std::vector< int > indsj, std::vector< int > indsp,
+                      int dimout,
+                      std::vector< int > dimsx, std::vector< int > dimsy, std::vector< int > dimsp,
+                      int **ranges,
+                      std::vector< int > shapeout, TYPE *out,
+                      TYPE **arg,
+                      std::vector <std::vector< int >> argshape
+    ) {
+
+
+        SetContext();
+
+        ////end_ = clock();
+        ////std::cout << "  time for set device : " << double(//end_ - start_) / CLOCKS_PER_SEC << std::endl;
+        //start_ = clock();
+
+        Sizes <TYPE> SS(nargs, arg, argshape, nx, ny,
+                        tagI, use_half,
+                        dimout,
+                        indsi, indsj, indsp,
+                        dimsx, dimsy, dimsp);
+
+        //end_ = clock();
+        //std::cout << "  time for Sizes : " << double(//end_ - start_) / CLOCKS_PER_SEC << std::endl;
+        //start_ = clock();
+
+        if (use_half)
+            SS.switch_to_half2_indexing();
+
+        Ranges <TYPE> RR(SS, ranges);
+        nx = SS.nx;
+        ny = SS.ny;
+
+        //end_ = clock();
+        //std::cout << "  time for Ranges : " << double(//end_ - start_) / CLOCKS_PER_SEC << std::endl;
+        //start_ = clock();
+
+        // now we switch (back...) indsi, indsj and dimsx, dimsy in case tagI=1.
+        // This is to be consistent with the convention used in the old
+        // bindings where i and j variables had different meanings in bindings
+        // and in the core code. Clearly we could do better if we
+        // carefully rewrite some parts of the code
+        if (tagI == 1) {
+            std::vector< int > tmp;
+
+            tmp = indsj;
+            indsj = indsi;
+            indsi = tmp;
+
+            tmp = dimsy;
+            dimsy = dimsx;
+            dimsx = tmp;
+        }
+
+
+        int blockSize_x = 1, blockSize_y = 1, blockSize_z = 1;
+
+        if (use_chunk_mode == 0) {
+            // warning : blockSize.x was previously set to CUDA_BLOCK_SIZE; currently CUDA_BLOCK_SIZE value is used as a bound.
+            blockSize_x = std::min(cuda_block_size,
+                                   std::min(maxThreadsPerBlock,
+                                            (int) (sharedMemPerBlock / std::max(1, (int) (dimY * sizeof(TYPE))))
+                                           )
+                                  ); // number of threads in each block
+        } else {
+            // warning : the value here must match the one which is set in file GpuReduc1D_chunks.py, line 59
+            // and file GpuReduc1D_finalchunks.py, line 67
+            blockSize_x = std::min(cuda_block_size,
+                                   std::min(1024, (int) (49152 / std::max(1, (int) (dimY * sizeof(TYPE)))))
+                                  );
+        }
+
+        int nblocks;
+
+        if (tagI == 1) {
+            int tmp = ny;
+            ny = nx;
+            nx = tmp;
+        }
+
+        int *lookup_d = NULL, *slices_x_d = NULL, *ranges_y_d = NULL;
+        int *offsets_d = NULL;
+
+        if (RR.tagRanges == 1) {
+            if (tagHostDevice == 1) {
+                range_preprocess_from_device(nblocks, tagI, RR.nranges_x, RR.nranges_y, RR.castedranges,
+                                             SS.nbatchdims, slices_x_d, ranges_y_d, lookup_d,
+                                             offsets_d,
+                                             blockSize_x, indsi, indsj, indsp, SS.shapes);
+            } else { // tagHostDevice==0
+                range_preprocess_from_host(nblocks, tagI, RR.nranges_x, RR.nranges_y, RR.nredranges_x, RR.nredranges_y,
+                                           RR.castedranges,
+                                           SS.nbatchdims, slices_x_d, ranges_y_d, lookup_d,
+                                           offsets_d,
+                                           blockSize_x, indsi, indsj, indsp, SS.shapes);
+            }
+        }
+
+        ////end_ = clock();
+        ////std::cout << "  time for interm : " << double(//end_ - start_) / CLOCKS_PER_SEC << std::endl;
+        //start_ = clock();
+
+        CUdeviceptr p_data;
+        TYPE *out_d;
+        TYPE **arg_d;
+
+        int sizeout = std::accumulate(shapeout.begin(), shapeout.end(), 1, std::multiplies< int >());
+
+        if (tagHostDevice == 1) {
+            p_data = buffer;
+            load_args_FromDevice(p_data, out, out_d, nargs, arg, arg_d);
+        } else
+            load_args_FromHost(p_data, out, out_d, nargs, arg, arg_d, argshape, sizeout);
+
+        ////end_ = clock();
+        ////std::cout << "  time for load_args : " << double(//end_ - start_) / CLOCKS_PER_SEC << std::endl;
+        //start_ = clock();
+
+        CUfunction kernel;
+
+        int gridSize_x = 1, gridSize_y = 1, gridSize_z = 1;
+
+        if (tag1D2D == 1) { // 2D scheme
+
+            gridSize_x = nx / blockSize_x + (nx % blockSize_x == 0 ? 0 : 1);
+            gridSize_y = ny / blockSize_x + (ny % blockSize_x == 0 ? 0 : 1);
+
+            // Reduce : grid and block are both 1d
+            int blockSize2_x = 1, blockSize2_y = 1, blockSize2_z = 1;
+            blockSize2_x = blockSize_x; // number of threads in each block
+            int gridSize2_x = 1, gridSize2_y = 1, gridSize2_z = 1;
+            gridSize2_x = (nx * dimred) / blockSize2_x + ((nx * dimred) % blockSize2_x == 0 ? 0 : 1);
+
+            // Data on the device. We need an "inflated" outB, which contains gridSize.y "copies" of out
+            // that will be reduced in the final pass.
+            TYPE *outB;
+
+            // single cudaMalloc
+            CUdeviceptr p_data_outB;
+            CUDA_SAFE_CALL(cuMemAlloc(&p_data_outB, sizeof(TYPE) * (nx * dimred * gridSize_y)));
+            outB = (TYPE *) p_data_outB;  // the inflated buffer allocated just above, not the argument block p_data
+
+            CUDA_SAFE_CALL(cuModuleGetFunction(&kernel, module, "GpuConv2DOnDevice"));
+
+            void *kernel_params[4];
+            kernel_params[0] = &nx;
+            kernel_params[1] = &ny;
+            kernel_params[2] = &outB;
+            kernel_params[3] = &arg_d;
+
+            // Size of the SharedData : blockSize.x*(DIMY)*sizeof(TYPE)
+
+            CUDA_SAFE_CALL(cuLaunchKernel(kernel,
+                                          gridSize_x, gridSize_y, gridSize_z,      // grid dim
+                                          blockSize_x, blockSize_y, blockSize_z,   // block dim
+                                          blockSize_x * dimY * sizeof(TYPE), NULL, // shared mem and stream
+                                          kernel_params, 0));
+            // block until the device has completed
+            CUDA_SAFE_CALL(cuCtxSynchronize());
+
+            // Since we've used a 2D scheme, there's still a "blockwise" line reduction to make on
+            // the output array px_d[0] = x1B. We go from shape ( gridSize.y * nx, DIMRED ) to (nx, DIMOUT)
+            CUfunction kernel_reduce;
+            CUDA_SAFE_CALL(cuModuleGetFunction(&kernel_reduce, module, "reduce2D"));
+            void *kernel_reduce_params[4];
+            kernel_reduce_params[0] = &outB;
+            kernel_reduce_params[1] = &out_d;
+            kernel_reduce_params[2] = &gridSize_y;
+            kernel_reduce_params[3] = &nx;
+
+            CUDA_SAFE_CALL(cuLaunchKernel(kernel_reduce,
+                                          gridSize2_x, gridSize2_y, gridSize2_z,    // grid dim
+                                          blockSize2_x, blockSize2_y, blockSize2_z,   // block dim
+                                          0, NULL,             // shared mem and stream
+                                          kernel_reduce_params, 0));
+            // Wait for the final reduction, then release the intermediate buffer.
+            CUDA_SAFE_CALL(cuCtxSynchronize());
+            CUDA_SAFE_CALL(cuMemFree(p_data_outB));
+        } else if (RR.tagRanges == 1 && tagZero == 0) {
+            // ranges mode
+
+            gridSize_x = nblocks;
+
+            CUDA_SAFE_CALL(cuModuleGetFunction(&kernel, module, "GpuConv1DOnDevice_ranges"));
+            // std::cout << "GpuConv1DOnDevice_ranges " << nx << " " << gridSize_x ;
+            void *kernel_params[9];
+            kernel_params[0] = &nx;
+            kernel_params[1] = &ny;
+            kernel_params[2] = &SS.nbatchdims;
+            kernel_params[3] = &offsets_d;
+            kernel_params[4] = &lookup_d;
+            kernel_params[5] = &slices_x_d;
+            kernel_params[6] = &ranges_y_d;
+            kernel_params[7] = &out_d;
+            kernel_params[8] = &arg_d;
+
+            CUDA_SAFE_CALL(cuLaunchKernel(kernel,
+                                          gridSize_x, gridSize_y, gridSize_z,       // grid dim
+                                          blockSize_x, blockSize_y, blockSize_z,    // block dim
+                                          blockSize_x * dimY * sizeof(TYPE), NULL,  // shared mem and stream
+                                          kernel_params, 0));                       // arguments
+
+        } else {
+            // simple mode
+
+            gridSize_x = nx / blockSize_x + (nx % blockSize_x == 0 ? 0 : 1);
+
+            CUDA_SAFE_CALL(cuModuleGetFunction(&kernel, module, "GpuConv1DOnDevice"));
+
+            void *kernel_params[4];
+            kernel_params[0] = &nx;
+            kernel_params[1] = &ny;
+            kernel_params[2] = &out_d;
+            kernel_params[3] = &arg_d;
+
+            //std::cout << "GpuConv1DOnDevice " << nx << " " << gridSize_x ;//<< " " << gridSize_y << " " << gridSize_z << " " <<  blockSize_x << " " << blockSize_y << " " <<  blockSize_z << " " <<   blockSize_x * dimY * sizeof(TYPE)  << std::endl;
+
+            CUDA_SAFE_CALL(cuLaunchKernel(kernel,
+                                          gridSize_x, gridSize_y, gridSize_z,        // grid dim
+                                          blockSize_x, blockSize_y, blockSize_z,     // block dim
+                                          blockSize_x * dimY * sizeof(TYPE), NULL,   // shared mem and stream
+                                          kernel_params, 0));                        // arguments
+        }
+
+        CUDA_SAFE_CALL(cuCtxSynchronize());
+
+        ////end_ = clock();
+        ////std::cout << "  time for kernel : " << double(//end_ - start_) / CLOCKS_PER_SEC << std::endl;
+        //start_ = clock();
+
+        // Send data from device to host.
+        if (tagHostDevice == 0) {
+            CUDA_SAFE_CALL(cuMemcpyDtoH(out, (CUdeviceptr) out_d, sizeof(TYPE) * sizeout));
+            CUDA_SAFE_CALL(cuMemFree(p_data));
+        }
+
+        // Release the range tables. slices_x_d / ranges_y_d are device copies made by the preprocessing step
+        // in host mode or when nbatchdims > 0; otherwise they alias the caller's arrays and must not be freed.
+        if (RR.tagRanges == 1) {
+            CUDA_SAFE_CALL(cuMemFree((CUdeviceptr) lookup_d));
+            if (tagHostDevice == 0 || SS.nbatchdims > 0) {
+                CUDA_SAFE_CALL(cuMemFree((CUdeviceptr) slices_x_d));
+                CUDA_SAFE_CALL(cuMemFree((CUdeviceptr) ranges_y_d));
+            }
+            if (SS.nbatchdims > 0) {
+                CUDA_SAFE_CALL(cuMemFree((CUdeviceptr) offsets_d));
+            }
+        }
+
+        //end_ = end = clock();
+        ////std::cout << "  time for last part : " << double(//end_ - start_) / CLOCKS_PER_SEC << std::endl;
+        ////std::cout << "time for launch_keops inner : " << double(end - start) / CLOCKS_PER_SEC << std::endl;
+
+        return 0;
+    }
+
+};
+
+
+template
+class KeOps_module< float >;
+
+template
+class KeOps_module< double >;
+
+template
+class KeOps_module< half2 >;
diff --git a/keopscore/keopscore/binders/nvrtc/nvrtc_jit_win.cpp b/keopscore/keopscore/binders/nvrtc/nvrtc_jit_win.cpp
new file mode 100644
index 000000000..39bb6c72c
--- /dev/null
+++ b/keopscore/keopscore/binders/nvrtc/nvrtc_jit_win.cpp
@@ -0,0 +1,114 @@
+// nvcc -shared -Xcompiler -fPIC -lnvrtc -lcuda keops_nvrtc.cu -o keops_nvrtc.so
+// g++ --verbose -L/opt/cuda/lib64 -L/opt/cuda/targets/x86_64-linux/lib/ -I/opt/cuda/targets/x86_64-linux/include/ -I../../include -shared -fPIC -lcuda -lnvrtc -fpermissive -DMAXIDGPU=0 -DMAXTHREADSPERBLOCK0=1024 -DSHAREDMEMPERBLOCK0=49152 -DnvrtcGetTARGET=nvrtcGetCUBIN -DnvrtcGetTARGETSize=nvrtcGetCUBINSize -DARCHTAG=\"sm\" keops_nvrtc.cpp -o keops_nvrtc.so
+// g++ -std=c++11  -shared -fPIC -O3 -fpermissive -L /usr/lib -L /opt/cuda/lib64 -lcuda -lnvrtc -DnvrtcGetTARGET=nvrtcGetCUBIN -DnvrtcGetTARGETSize=nvrtcGetCUBINSize -DARCHTAG=\"sm\"  -I/home/bcharlier/projets/keops/keops/keops/include -I/opt/cuda/include -I/usr/include/python3.10/ -DMAXIDGPU=0 -DMAXTHREADSPERBLOCK0=1024 -DSHAREDMEMPERBLOCK0=49152  /home/bcharlier/projets/keops/keops/keops/binders/nvrtc/keops_nvrtc.cpp -o keops_nvrtc.cpython-310-x86_64-linux-gnu.so
+
+#include <nvrtc.h>
+#include <cuda.h>
+#include <stdio.h>
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <stdarg.h>
+#include <string.h>
+#include <vector>
+//#include <ctime>
+
+#define C_CONTIGUOUS 1
+#define USE_HALF 0
+
+#ifdef _WIN32
+#define DLL_EXPORT extern "C" __declspec(dllexport)
+#else
+#define DLL_EXPORT extern "C"
+#endif
+
+#include "include/Sizes_win.h"
+#include "include/Ranges_win.h"
+#include "include/utils_pe_win.h"
+#include "include/ranges_utils_win.h"
+
+
+#include "include/CudaSizes_win.h"
+#include <cuda_fp16.h>
+
+// Compile the CUDA source `cu_code` with NVRTC for the compute capability of device
+// `device_id`, then write the result to `target_file_name` as a size_t byte count
+// followed by the PTX/CUBIN bytes. When `use_half` is non-zero, cuda_fp16.h and
+// cuda_fp16.hpp are read from `cuda_include_path` (concatenated as-is, so the caller
+// must supply the trailing path separator). Returns 0 on success and throws
+// std::runtime_error when nvrtcCompileProgram fails.
+DLL_EXPORT
+int Compile(const char *target_file_name, const char *cu_code, int use_half, int device_id,
+                       const char *cuda_include_path) {
+    nvrtcProgram prog;
+
+    // Optional in-memory headers handed to NVRTC (only needed for half precision).
+    int numHeaders = 0;
+    const char *header_names[2];
+    const char *header_sources[2];
+
+    if (use_half) {
+        // The paths are only built when needed, so cuda_include_path is not read otherwise.
+        std::ostringstream cuda_fp16_h_path, cuda_fp16_hpp_path;
+        cuda_fp16_h_path << cuda_include_path << "cuda_fp16.h" ;
+        cuda_fp16_hpp_path << cuda_include_path << "cuda_fp16.hpp" ;
+        numHeaders = 2;
+        header_names[0] = "cuda_fp16.h";
+        header_sources[0] = read_text_file(cuda_fp16_h_path.str().c_str());
+        header_names[1] = "cuda_fp16.hpp";
+        header_sources[1] = read_text_file(cuda_fp16_hpp_path.str().c_str());
+    }
+
+    // Get device id from Driver API
+    CUdevice cuDevice;
+    CUDA_SAFE_CALL(cuDeviceGet(&cuDevice, device_id));
+
+    // Get Compute Capability from Driver API
+    int deviceProp_major, deviceProp_minor;
+    CUDA_SAFE_CALL(cuDeviceGetAttribute(&deviceProp_major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
+    CUDA_SAFE_CALL(cuDeviceGetAttribute(&deviceProp_minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
+
+    std::ostringstream arch_flag;
+    arch_flag << "-arch=" << ARCHTAG << "_" << deviceProp_major << deviceProp_minor;
+
+    // Keep the flag in a std::string that outlives the compile call. The previous code
+    // leaked a new[] buffer and then called delete[] on strdup()'d (malloc) memory.
+    const std::string arch_flag_str = arch_flag.str();
+    const char *opts[] = {arch_flag_str.c_str(), "-use_fast_math"};
+
+    NVRTC_SAFE_CALL(nvrtcCreateProgram(&prog,         // prog
+                                       cu_code,         // buffer
+                                       NULL,            // name
+                                       numHeaders,      // numHeaders
+                                       header_sources,  // headers
+                                       header_names     // includeNames
+                                      ));
+
+    nvrtcResult compileResult = nvrtcCompileProgram(prog,     // prog
+                                2,              // numOptions
+                                opts);          // options
+
+    if (compileResult != NVRTC_SUCCESS) {
+        nvrtcDestroyProgram(&prog);  // best effort: do not leak the program on failure
+        throw std::runtime_error("[KeOps] Error when compiling formula (error in nvrtcCompileProgram).");
+    }
+
+    // Obtain PTX or CUBIN from the program.
+    size_t targetSize;
+    NVRTC_SAFE_CALL(nvrtcGetTARGETSize(prog, &targetSize));
+
+    // std::vector releases the buffer on every exit path, including an exception below.
+    std::vector< char > target(targetSize);
+    NVRTC_SAFE_CALL(nvrtcGetTARGET(prog, target.data()));
+
+    // Destroy the program.
+    NVRTC_SAFE_CALL(nvrtcDestroyProgram(&prog));
+
+    // Write the size header followed by the compiled payload.
+    std::ofstream wf(target_file_name, std::ofstream::binary);
+    wf.write((char*)&targetSize, sizeof(size_t));
+    wf.write(target.data(), targetSize);
+    wf.close();
+
+    return 0;
+}
diff --git a/keopscore/keopscore/config/__init__.py b/keopscore/keopscore/config/__init__.py
index fc31a0b9f..314189442 100644
--- a/keopscore/keopscore/config/__init__.py
+++ b/keopscore/keopscore/config/__init__.py
@@ -1,6 +1,10 @@
 # Import the configuration classes
 from .base_config import Config
-from .cuda import CUDAConfig
+import os
+if os.name != "nt":
+    from .cuda import CUDAConfig
+else:
+    from .cuda_windows import CUDAConfigWin as CUDAConfig
 from .openmp import OpenMPConfig
 from .Platform import DetectPlatform
 
diff --git a/keopscore/keopscore/config/cuda_windows.py b/keopscore/keopscore/config/cuda_windows.py
new file mode 100644
index 000000000..5431f8b9c
--- /dev/null
+++ b/keopscore/keopscore/config/cuda_windows.py
@@ -0,0 +1,207 @@
+import os
+import ctypes
+from ctypes.util import find_library
+from ctypes import (
+    c_int,
+    c_void_p,
+    c_char_p,
+    CDLL,
+    byref,
+    cast,
+    POINTER,
+    Structure,
+    RTLD_GLOBAL,
+)
+from pathlib import Path
+import shutil
+from os.path import join
+import platform
+import tempfile
+import subprocess
+import sys
+import keopscore
+from keopscore.utils.misc_utils import KeOps_Warning
+from keopscore.utils.misc_utils import KeOps_OS_Run
+from keopscore.utils.misc_utils import CHECK_MARK, CROSS_MARK
+
+from .cuda import CUDAConfig
+
+from ..windows_compilations import cuda_detection
+
+
+
+detection = cuda_detection.detect_cuda_toolkit()
+# cuda_lib = detection['lib_dirs']
+# cuda_include = detection['include_dir']
+# cuda_dll =  detection['dll_cuda']
+# cudart_dll =  detection['dll_cudart']
+# cuda_nvrtc =  detection['dll_nvrtc']
+
+
+class CUDAConfigWin(CUDAConfig):
+    """
+    Class for CUDA detection on windows and configuration.
+    """
+
+    # CUDA constants
+    CUDA_SUCCESS = 0
+    CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1
+    CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK = 8
+
+    def set_use_cuda(self):
+        """Determine and set whether to use CUDA."""
+        self._use_cuda = cuda_detection.cuda_available
+        if not self._cuda_libraries_available():
+            self._use_cuda = False
+        
+        self.get_cuda_version()
+        self.get_cuda_include_path()
+        self.get_gpu_props()
+
+        if self.n_gpus == 0 and self._use_cuda:
+            self._use_cuda = False
+            self.cuda_message = "CUDA libraries detected, but no GPUs found on this system; Switching to CPU only."
+            KeOps_Warning(self.cuda_message)
+
+    def _cuda_libraries_available(self):
+        """
+        Check if both cuda and nvrtc libraries are available.
+        Returns:
+            True if both cuda and nvrtc are loadable, False otherwise.
+            This is also where we handle one single warning if needed.
+        """
+
+        return 'dll_nvrtc' in detection and 'dll_cuda' in detection
+    
+
+    def get_cuda_version(self, out_type="single_value"):
+
+        if not self._use_cuda:
+            self.cuda_version = None
+            return None
+        try:
+
+            libcudart = ctypes.CDLL(detection['dll_cudart'])
+            cuda_version = ctypes.c_int()
+            libcudart.cudaRuntimeGetVersion(ctypes.byref(cuda_version))
+            cuda_version_value = int(cuda_version.value)
+
+            if out_type == "single_value":
+                self.cuda_version = cuda_version_value
+                return cuda_version_value
+
+            major = cuda_version_value // 1000
+            minor = (cuda_version_value % 1000) // 10
+
+            if out_type == "major,minor":
+                return major, minor
+            elif out_type == "string":
+                return f"{major}.{minor}"
+        except Exception as e:
+            KeOps_Warning(f"Could not determine CUDA version: {e}")
+            self.cuda_version = None
+            return None
+        
+    
+    def get_gpu_props(self):
+        """
+        Getting GPU properties and related attributes.
+        """
+        if not self._use_cuda:
+            # Already determined that CUDA is unavailable
+            self.n_gpus = 0
+            self.gpu_compile_flags = ""
+            return (self.n_gpus, self.gpu_compile_flags)
+
+        # Attempt to load the CUDA driver library
+        libcuda_path = detection['dll_cuda']
+
+        # We have a handle, let's proceed
+        libcuda = ctypes.CDLL(libcuda_path)
+        result = libcuda.cuInit(0)
+        if result != self.CUDA_SUCCESS:
+            KeOps_Warning(
+                "CUDA was detected, but driver API could not be initialized. Switching to CPU only."
+            )
+            self.n_gpus = 0
+            self.gpu_compile_flags = ""
+            self._use_cuda = False
+            return (self.n_gpus, self.gpu_compile_flags)
+
+        # Get GPU count
+        nGpus = ctypes.c_int()
+        result = libcuda.cuDeviceGetCount(ctypes.byref(nGpus))
+        if result != self.CUDA_SUCCESS:
+            KeOps_Warning(
+                "CUDA was detected and driver API was initialized, but no working GPU found. "
+                "Switching to CPU only."
+            )
+            self.n_gpus = 0
+            self.gpu_compile_flags = ""
+            self._use_cuda = False
+            return (self.n_gpus, self.gpu_compile_flags)
+
+        self.n_gpus = nGpus.value
+        if self.n_gpus == 0:
+            self.gpu_compile_flags = ""
+            return (self.n_gpus, self.gpu_compile_flags)
+
+        # Query each GPU for properties
+        MaxThreadsPerBlock = [0] * self.n_gpus
+        SharedMemPerBlock = [0] * self.n_gpus
+
+        def safe_call(dev_idx, result_code):
+            if result_code != self.CUDA_SUCCESS:
+                KeOps_Warning(
+                    f"Error detecting properties for GPU device {dev_idx}. "
+                    "Switching to CPU only."
+                )
+                return False
+            return True
+
+        for d in range(self.n_gpus):
+            device = ctypes.c_int()
+            if not safe_call(d, libcuda.cuDeviceGet(ctypes.byref(device), d)):
+                self.n_gpus = 0
+                self.gpu_compile_flags = ""
+                self._use_cuda = False
+                return (self.n_gpus, self.gpu_compile_flags)
+
+            output = ctypes.c_int()
+            if not safe_call(
+                d,
+                libcuda.cuDeviceGetAttribute(
+                    byref(output),
+                    self.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
+                    device,
+                ),
+            ):
+                self.n_gpus = 0
+                self.gpu_compile_flags = ""
+                self._use_cuda = False
+                return (self.n_gpus, self.gpu_compile_flags)
+            MaxThreadsPerBlock[d] = output.value
+
+            if not safe_call(
+                d,
+                libcuda.cuDeviceGetAttribute(
+                    byref(output),
+                    self.CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK,
+                    device,
+                ),
+            ):
+                self.n_gpus = 0
+                self.gpu_compile_flags = ""
+                self._use_cuda = False
+                return (self.n_gpus, self.gpu_compile_flags)
+            SharedMemPerBlock[d] = output.value
+
+        # Build compile flags string
+        self.gpu_compile_flags = f"-DMAXIDGPU={self.n_gpus - 1} "
+        for d in range(self.n_gpus):
+            self.gpu_compile_flags += (
+                f"-DMAXTHREADSPERBLOCK{d}={MaxThreadsPerBlock[d]} "
+            )
+            self.gpu_compile_flags += f"-DSHAREDMEMPERBLOCK{d}={SharedMemPerBlock[d]} "
+
+        return self.n_gpus, self.gpu_compile_flags
diff --git a/keopscore/keopscore/include/CudaSizes_win.h b/keopscore/keopscore/include/CudaSizes_win.h
new file mode 100644
index 000000000..09862a030
--- /dev/null
+++ b/keopscore/keopscore/include/CudaSizes_win.h
@@ -0,0 +1,96 @@
+#pragma once
+
+/////////////////////////////////////////////
+//            GPU     Options             //
+/////////////////////////////////////////////
+
+
+
+// fix some Gpu properties
+// CUDA_BLOCK_SIZE gives an upper bound on size of the size of Cuda blocks
+// The actual block size may be lower due to memory limitations, depending on the formula used
+#ifndef CUDA_BLOCK_SIZE
+#define CUDA_BLOCK_SIZE 192
+#endif
+// Here we define the maximum number of threads per block and the shared memory per block
+// These values can depend on the Gpu, although in fact values 1024 and 49152 respectively
+// are the good values for almost all cards.
+// So these values should be fine, but you can check them with GetGpuProps.cu program
+// Here we assume that: either the user has defined MAXIDGPU (=number of Gpu devices minus one)
+// and corresponding specific values MAXTHREADSPERBLOCK0, SHAREDMEMPERBLOCK0, MAXTHREADSPERBLOCK1, SHAREDMEMPERBLOCK1, ...
+// for each device, or MAXIDGPU is not defined, and we will use global MAXTHREADSPERBLOCK and SHAREDMEMPERBLOCK
+#ifndef MAXIDGPU
+// we give default values
+#ifndef MAXTHREADSPERBLOCK
+#define MAXTHREADSPERBLOCK 1024
+#endif
+#ifndef SHAREDMEMPERBLOCK
+#define SHAREDMEMPERBLOCK 49152
+#endif
+#endif
+
+// global variables maxThreadsPerBlock and sharedMemPerBlock may depend on the device, so we will set them at each call using
+// predefined MAXTHREADSPERBLOCK0, SHAREDMEMPERBLOCK0, MAXTHREADSPERBLOCK1, SHAREDMEMPERBLOCK1, etc.
+// through the function SetGpuProps
+int maxThreadsPerBlock, sharedMemPerBlock;
+
+#define SET_GPU_PROPS_MACRO(n) \
+    if(device == n) { \
+      maxThreadsPerBlock = MAXTHREADSPERBLOCK ## n; \
+      sharedMemPerBlock = SHAREDMEMPERBLOCK ## n; \
+      return; \
+    }
+
+// I have not managed to use a "recursive macro" hack, it was not compiling on all systems.
+// This handles device ids 0 to 11 (at most 12 Gpus) ; feel free to add more lines if needed !
+void SetGpuProps(int device) {
+    // Sets the globals maxThreadsPerBlock and sharedMemPerBlock for the given device id.
+#if defined(MAXTHREADSPERBLOCK) && defined(SHAREDMEMPERBLOCK)
+    // global values are defined
+    maxThreadsPerBlock = MAXTHREADSPERBLOCK;
+    sharedMemPerBlock = SHAREDMEMPERBLOCK;
+    return;
+#else
+#if MAXIDGPU >= 0
+    SET_GPU_PROPS_MACRO(0)
+#endif
+#if MAXIDGPU >= 1
+    SET_GPU_PROPS_MACRO(1)
+#endif
+#if MAXIDGPU >= 2
+    SET_GPU_PROPS_MACRO(2)
+#endif
+#if MAXIDGPU >= 3
+    SET_GPU_PROPS_MACRO(3)
+#endif
+#if MAXIDGPU >= 4
+    SET_GPU_PROPS_MACRO(4)
+#endif
+#if MAXIDGPU >= 5
+    SET_GPU_PROPS_MACRO(5)
+#endif
+#if MAXIDGPU >= 6
+    SET_GPU_PROPS_MACRO(6)
+#endif
+#if MAXIDGPU >= 7
+    SET_GPU_PROPS_MACRO(7)
+#endif
+#if MAXIDGPU >= 8
+    SET_GPU_PROPS_MACRO(8)
+#endif
+#if MAXIDGPU >= 9
+    SET_GPU_PROPS_MACRO(9)
+#endif
+#if MAXIDGPU >= 10
+    SET_GPU_PROPS_MACRO(10)
+#endif
+#if MAXIDGPU >= 11
+    SET_GPU_PROPS_MACRO(11)
+#endif
+    fprintf( stderr, "invalid Gpu device number. If the number of available Gpus is > 12, add required lines at the end of function SetGpuProps and recompile.\n");
+    throw std::runtime_error("[KeOps] Cuda error.");
+#endif
+    // Reached only if none of the branches above returned or threw.
+}
+
+
diff --git a/keopscore/keopscore/include/Ranges_win.h b/keopscore/keopscore/include/Ranges_win.h
new file mode 100644
index 000000000..19270499e
--- /dev/null
+++ b/keopscore/keopscore/include/Ranges_win.h
@@ -0,0 +1,99 @@
+#pragma once
+
+#include "Sizes_win.h"
+
+template< typename TYPE >
+class Ranges {
+    // Block-sparse "ranges" description of a reduction: either the caller-supplied arrays
+    // (no batch dimensions) or a block-diagonal pattern generated to emulate batch processing.
+    // The object stores pointers into its own vectors and into the caller's `ranges` arrays.
+public:
+    // Zero-initialised so that no counter is ever read uninitialised, even on the unsupported
+    // "user ranges + batch dimensions" path, which is only rejected when do_checks is enabled.
+    int tagRanges = 0, nranges_x = 0, nranges_y = 0, nredranges_x = 0, nredranges_y = 0;
+
+    std::vector< int > ranges_i, slices_i, redranges_j;  // storage for the generated batch pattern
+    std::vector< int * > _castedranges;                   // the six range arrays actually used
+    int **castedranges;                                   // points at _castedranges' storage
+
+    // `ranges` is indexed up to ranges[6]: ranges[0..5] are the six range arrays and ranges[6]
+    // holds their counts, with ranges[6][0] == -1 meaning that no ranges were supplied.
+    Ranges(Sizes< TYPE > sizes, int **ranges) {
+
+        _castedranges.resize(6);
+
+        // Sparsity: should we handle ranges? ======================================
+        if (sizes.nbatchdims == 0) {  // Standard M-by-N computation
+            if (ranges[6][0] == -1) {
+                tagRanges = 0;
+
+                nranges_x = 0;
+                nranges_y = 0;
+
+                nredranges_x = 0;
+                nredranges_y = 0;
+
+            } else {
+                tagRanges = 1;
+                nranges_x = ranges[6][0];
+                nranges_y = ranges[6][3];
+                nredranges_x = ranges[6][5];
+                nredranges_y = ranges[6][2];
+
+                // get the pointers to data to avoid a copy
+                for (int i = 0; i < 6; i++) {
+                    _castedranges[i] = ranges[i];
+                }
+            }
+
+        } else if (ranges[6][0] == -1) {
+            // Batch processing: we'll have to generate a custom, block-diagonal sparsity pattern
+            tagRanges = 1;  // Batch processing is emulated through the block-sparse mode
+
+            // Create new "castedranges" from scratch ------------------------------
+            // With pythonic notations, we'll have:
+            //   castedranges = (ranges_i, slices_i, redranges_j,   ranges_j, slices_j, redranges_i)
+            // with:
+            // - ranges_i    = redranges_i = [ [0,M], [M,2M], ..., [(nbatches-1)M, nbatches*M] ]
+            // - slices_i    = slices_j    = [    1,     2,   ...,   nbatches-1,   nbatches    ]
+            // - redranges_j = ranges_j    = [ [0,N], [N,2N], ..., [(nbatches-1)N, nbatches*N] ]
+
+            ranges_i.resize(2 * sizes.nbatches, 0);
+            slices_i.resize(sizes.nbatches, 0);
+            redranges_j.resize(2 * sizes.nbatches, 0);
+
+            for (int b = 0; b < sizes.nbatches; b++) {
+                ranges_i[2 * b] = b * sizes.M;
+                ranges_i[2 * b + 1] = (b + 1) * sizes.M;
+                slices_i[b] = (b + 1);
+                redranges_j[2 * b] = b * sizes.N;
+                redranges_j[2 * b + 1] = (b + 1) * sizes.N;
+            }
+
+            _castedranges[0] = &ranges_i[0];
+            _castedranges[1] = &slices_i[0];
+            _castedranges[2] = &redranges_j[0];
+            _castedranges[3] = &redranges_j[0];         // ranges_j
+            _castedranges[4] = &slices_i[0];            // slices_j
+            _castedranges[5] = &ranges_i[0];            // redranges_i
+
+            nranges_x = sizes.nbatches;
+            nredranges_x = sizes.nbatches;
+            nranges_y = sizes.nbatches;
+            nredranges_y = sizes.nbatches;
+        }
+#if do_checks
+        else {
+            throw std::runtime_error(
+                "[KeOps] The 'ranges' argument (block-sparse mode) is not supported with batch processing, "
+                "but we detected " + std::to_string(sizes.nbatchdims) + " > 0 batch dimensions."
+            );
+        }
+#endif
+
+        // Set last so that the optional `else` above stays attached to the if / else-if chain:
+        // the assignment used to sit between them, which cannot compile once do_checks is on.
+        castedranges = _castedranges.data();
+    }
+
+};
diff --git a/keopscore/keopscore/include/Sizes_win.h b/keopscore/keopscore/include/Sizes_win.h
new file mode 100644
index 000000000..c78f30e7c
--- /dev/null
+++ b/keopscore/keopscore/include/Sizes_win.h
@@ -0,0 +1,402 @@
+#pragma once
+
+#include <algorithm>
+#include <vector>
+#include <numeric>
+#include <functional>
+
+
+#define MIN(a, b) (((a)<(b))?(a):(b))
+#define MAX(a, b) (((a)<(b))?(b):(a))
+#define MAX3(a, b, c) (MAX(MAX(a,b),c))
+
+#define do_checks 0
+#if do_checks
+void error(std::string message) {
+    throw std::runtime_error(message);
+}
+#endif
+
+
+#if C_CONTIGUOUS
+// Returns entry b of `_shape`. The vector is taken by const reference: no copy per call.
+int get_val_batch(const std::vector< int > &_shape, int nbatch, int b) {
+    return _shape[b];
+}
+
+#else
+// Returns entry (nbatch - b) of `_shape`. NOTE(review): b == 0 reads index nbatch - confirm.
+int get_val_batch(const std::vector< int > &_shape, int nbatch, int b) {
+    return _shape[nbatch - b];
+}
+
+#endif
+
+template< typename TYPE >
+class Sizes {
+public:
+    // Shape bookkeeping for one reduction call: argument categories, batch dims, "_shapes" table.
+    // attributes
+    int nargs;
+    int nx, ny;
+    int M, N;
+    int nbatchdims;
+    int nbatches;
+
+    std::vector< int > _shapes;
+    int *shapes;
+    std::vector< int > _shape_out;
+    int *shape_out;
+
+    int tagIJ;
+    int use_half;
+    std::vector< int > indsI;
+    std::vector< int > indsJ;
+    std::vector< int > indsP;
+    int pos_first_argI;
+    int pos_first_argJ;
+    int dimout;
+    int nminargs;
+    int nvarsI;
+    int nvarsJ;
+    int nvarsP;
+    std::vector< int > dimsX;
+    std::vector< int > dimsY;
+    std::vector< int > dimsP;
+
+    // constructor: `args` is not read here; every vector argument is copied into a member
+    Sizes(int _nargs, TYPE **args, const std::vector <std::vector< int >> &argshapes, int _nx, int _ny,
+          int tagIJ_, int use_half_, int dimout_,
+          const std::vector< int > &indsI_, const std::vector< int > &indsJ_, const std::vector< int > &indsP_,
+          const std::vector< int > &dimsX_, const std::vector< int > &dimsY_, const std::vector< int > &dimsP_) {
+
+        tagIJ = tagIJ_;
+        use_half = use_half_;
+        indsI = indsI_;
+        indsJ = indsJ_;
+        indsP = indsP_;
+        dimout = dimout_;
+
+        nvarsI = indsI.size();
+        nvarsJ = indsJ.size();
+        nvarsP = indsP.size();
+
+        pos_first_argI = (nvarsI > 0) ? *std::min_element(indsI.begin(), indsI.end()) : -1;
+        pos_first_argJ = (nvarsJ > 0) ? *std::min_element(indsJ.begin(), indsJ.end()) : -1;
+
+        int max_i = (nvarsI > 0) ? *std::max_element(indsI.begin(), indsI.end()) : -1;
+        int max_j = (nvarsJ > 0) ? *std::max_element(indsJ.begin(), indsJ.end()) : -1;
+        int max_p = (nvarsP > 0) ? *std::max_element(indsP.begin(), indsP.end()) : -1;
+
+        nminargs = 1 + MAX3(max_i, max_j, max_p);
+        dimsX = dimsX_;
+        dimsY = dimsY_;
+        dimsP = dimsP_;
+        nargs = _nargs;
+        nx = _nx;
+        ny = _ny;
+
+        // fill shapes with "batch dimensions" [A, .., B], the table will look like:
+        //
+        // [ A, .., B, M, N, D_out]  -> output
+        // [ A, .., B, M, 1, D_1  ]  -> "i" variable
+        // [ A, .., B, 1, N, D_2  ]  -> "j" variable
+        // [ A, .., B, 1, 1, D_3  ]  -> "parameter"
+        // [ A, .., 1, M, 1, D_4  ]  -> N.B.: we support broadcasting on the batch dimensions!
+        // [ 1, .., 1, M, 1, D_5  ]  ->      (we'll just ask users to fill in the shapes with *explicit* ones)
+        fill_shape(nargs, argshapes);
+
+        check_ranges(argshapes);
+
+        // fill shape_out
+        _shape_out.resize(nbatchdims + 3);
+
+#if C_CONTIGUOUS
+        std::copy(_shapes.begin(), _shapes.begin() + nbatchdims + 3, _shape_out.begin());// Copy the "batch dimensions"
+        _shape_out.erase(_shape_out.begin() + nbatchdims + (1 - tagIJ));
+
+#else
+        std::reverse_copy(_shapes.begin(), _shapes.begin() + nbatchdims + 3,
+                          _shape_out.begin());// Copy the "batch dimensions"
+        // Same entry as in the branch above; after the reversal it sits at index 1 + tagIJ.
+        _shape_out.erase(_shape_out.begin() + 1 + tagIJ);
+
+#endif
+
+        // fill nx and ny
+        M = _shapes[nbatchdims];      // = M
+        N = _shapes[nbatchdims + 1];  // = N
+
+        // Compute the product of all "batch dimensions"
+        nbatches = std::accumulate(_shapes.begin(), _shapes.begin() + nbatchdims, 1, std::multiplies< int >());
+
+        nx = nbatches * M;  // = A * ... * B * M
+        ny = nbatches * N;  // = A * ... * B * N
+
+        shapes = &_shapes[0];
+        shape_out = &_shape_out[0];
+    }
+
+
+    // methods
+    // Halves nx, ny, M, N and the matching "_shapes" entries (float16 inputs handled as half2).
+    void switch_to_half2_indexing();
+
+private:
+    void fill_shape(const int nargs, const std::vector <std::vector< int >> &argshapes);
+
+    void check_ranges(const std::vector <std::vector< int >> &argshapes);
+    // Indices of the M/N entry and of the vector-size entry inside an argument's own shape.
+    int MN_pos, D_pos;
+};
+
+
+template< typename TYPE >
+void Sizes< TYPE >::fill_shape(const int nargs, const std::vector <std::vector< int >> &argshapes) {
+    // Sizes the `_shapes` table (one row of nbatchdims + 3 ints for the output, then one row
+    // per argument) and writes M, N and dimout into the output row.
+
+    // The batch dimensions are inferred from a single reference argument: the larger of the
+    // first "i" and first "j" positions. Without any such argument there is no batch dimension.
+    const int ref_arg = std::max(pos_first_argI, pos_first_argJ);
+    nbatchdims = 0;
+    if (ref_arg > -1) {
+        // number of batch dimensions = number of dims of that tensor - 2
+        nbatchdims = argshapes[ref_arg].size() - 2;
+#if do_checks
+        if (nbatchdims < 0) {
+            error("[KeOps] Wrong number of dimensions for arg at position 0: is "
+                  + std::to_string(argshapes[0].size()) + " but should be at least 2."
+                 );
+        }
+#endif
+    }
+
+    // Where the M/N entry and the vector-size entry sit inside an argument's own shape.
+#if C_CONTIGUOUS
+    MN_pos = nbatchdims;
+    D_pos = nbatchdims + 1;
+#else
+    MN_pos = 1;
+    D_pos = 0;
+#endif
+
+    // Now, we'll keep track of the output + all arguments' shapes in a large array;
+    // entries created by this resize start at 1.
+    _shapes.resize((nargs + 1) * (nbatchdims + 3), 1);
+
+    // Output row. With half precision, one of nx / ny is rounded up to an even value and the
+    // other one is doubled, depending on tagIJ.
+    int rows = nx;
+    int cols = ny;
+    if (use_half) {
+        const bool tag_is_zero = (tagIJ == 0);
+        rows = tag_is_zero ? (nx % 2 ? nx + 1 : nx) : 2 * nx;
+        cols = tag_is_zero ? 2 * ny : (ny % 2 ? ny + 1 : ny);
+    }
+    _shapes[nbatchdims] = rows;
+    _shapes[nbatchdims + 1] = cols;
+    _shapes[nbatchdims + 2] = dimout;   // Top right corner: dimension of the output
+}
+
+template< typename TYPE >
+void Sizes< TYPE >::check_ranges(const std::vector <std::vector< int >> &argshapes) {
+    // Copies each argument's shape into its row of _shapes; "i"/"j" args also update the batch dims of row 0.
+    // Check the compatibility of all tensor shapes ==================================
+    if (nminargs > 0) {
+
+        // Checks args in all the positions that correspond to "i" variables:
+        for (int k = 0; k < nvarsI; k++) {
+            int i = indsI[k];
+
+            // Fill in the (i+1)-th line of the "shapes" array ---------------------------
+            int off_i = (i + 1) * (nbatchdims + 3);
+
+            // Check the number of dimensions --------------------------------------------
+            int ndims = argshapes[i].size();  // Number of dims of the i-th tensor
+
+#if do_checks
+            if (ndims != nbatchdims + 2) {
+                error("[KeOps] Wrong number of dimensions for arg at position " + std::to_string(i)
+                      + " (i type): KeOps detected " + std::to_string(nbatchdims)
+                      + " batch dimensions from the first argument 0, and thus expected "
+                      + std::to_string(nbatchdims + 2)
+                      + " dimensions here, but only received "
+                      + std::to_string(ndims)
+                      + ". Note that KeOps supports broadcasting on batch dimensions, "
+                      + "but still expects 'dummy' unit dimensions in the input shapes, "
+                      + "for the sake of clarity.");
+            }
+#endif
+
+            // (ndims is only read by the check above, i.e. when do_checks is enabled)
+
+            // First, the batch dimensions:
+            for (int b = 0; b < nbatchdims; b++) {
+                _shapes[off_i + b] = get_val_batch(argshapes[i], nbatchdims + 2, b);
+
+                // Check that the current value is compatible with what
+                // we've encountered so far, as stored in the first line of "shapes"
+                if (_shapes[off_i + b] != 1) {  // This dimension is not "broadcasted"
+                    if (_shapes[b] == 1) {
+                        _shapes[b] = _shapes[off_i + b];  // -> it becomes the new standard
+                    }
+#if do_checks
+                    else if (_shapes[b] != _shapes[off_i + b]) {
+                        error("[KeOps] Wrong value of the batch dimension "
+                              + std::to_string(b) + " for argument number " + std::to_string(i)
+                              + " : is " + std::to_string(_shapes[off_i + b])
+                              + " but was " + std::to_string(_shapes[b])
+                              + " or 1 in previous arguments.");
+                    }
+#endif
+                }
+            }
+
+            _shapes[off_i + nbatchdims] = argshapes[i][MN_pos];  // = "M"
+            _shapes[off_i + nbatchdims + 2] = argshapes[i][D_pos];  // = "D"
+
+
+#if do_checks
+            // Check the number of "lines":
+            if (_shapes[nbatchdims] != _shapes[off_i + nbatchdims]) {
+                error("[KeOps] Wrong value of the 'i' dimension "
+                      + std::to_string(nbatchdims) + "for arg at position " + std::to_string(i)
+                      + " : is " + std::to_string(_shapes[off_i + nbatchdims])
+                      + " but was " + std::to_string(_shapes[nbatchdims])
+                      + " in previous 'i' arguments.");
+            }
+
+            // And the number of "columns":
+            if (_shapes[off_i + nbatchdims + 2] != static_cast< int >(dimsX[k])) {
+                error("[KeOps] Wrong value of the 'vector size' dimension "
+                      + std::to_string(nbatchdims + 1) + " for arg at position " + std::to_string(i)
+                      + " : is " + std::to_string(_shapes[off_i + nbatchdims + 2])
+                      + " but should be " + std::to_string(dimsX[k]));
+            }
+#endif
+        }
+
+
+        // Checks args in all the positions that correspond to "j" variables:
+        for (int k = 0; k < nvarsJ; k++) {
+            int i = indsJ[k];
+
+            // Check the number of dimensions --------------------------------------------
+            int ndims = argshapes[i].size();  // Number of dims of the i-th tensor
+
+#if do_checks
+            if (ndims != nbatchdims + 2) {
+                error("[KeOps] Wrong number of dimensions for arg at position " + std::to_string(i)
+                      + " (j type): KeOps detected " + std::to_string(nbatchdims)
+                      + " batch dimensions from the first argument 0, and thus expected "
+                      + std::to_string(nbatchdims + 2)
+                      + " dimensions here, but only received "
+                      + std::to_string(ndims)
+                      + ". Note that KeOps supports broadcasting on batch dimensions, "
+                      + "but still expects 'dummy' unit dimensions in the input shapes, "
+                      + "for the sake of clarity.");
+            }
+#endif
+
+            // Fill in the (i+1)-th line of the "shapes" array ---------------------------
+            int off_i = (i + 1) * (nbatchdims + 3);
+
+            // First, the batch dimensions:
+            for (int b = 0; b < nbatchdims; b++) {
+                _shapes[off_i + b] = get_val_batch(argshapes[i], nbatchdims + 2, b);
+
+                // Check that the current value is compatible with what
+                // we've encountered so far, as stored in the first line of "shapes"
+                if (_shapes[off_i + b] != 1) {  // This dimension is not "broadcasted"
+                    if (_shapes[b] == 1) {
+                        _shapes[b] = _shapes[off_i + b];  // -> it becomes the new standard
+                    }
+#if do_checks
+                    else if (_shapes[b] != _shapes[off_i + b]) {
+                        error("[KeOps] Wrong value of the batch dimension "
+                              + std::to_string(b) + " for argument number " + std::to_string(i)
+                              + " : is " + std::to_string(_shapes[off_i + b])
+                              + " but was " + std::to_string(_shapes[b])
+                              + " or 1 in previous arguments.");
+                    }
+#endif
+                }
+            }
+
+            _shapes[off_i + nbatchdims + 1] = argshapes[i][MN_pos];  // = "N"
+            _shapes[off_i + nbatchdims + 2] = argshapes[i][D_pos];  // = "D"
+
+
+#if do_checks
+            // Check the number of "lines":
+            if (_shapes[nbatchdims + 1] != _shapes[off_i + nbatchdims + 1]) {
+                error("[KeOps] Wrong value of the 'j' dimension "
+                      + std::to_string(nbatchdims) + " for arg at position " + std::to_string(i)
+                      + " : is " + std::to_string(_shapes[off_i + nbatchdims + 1])
+                      + " but was " + std::to_string(_shapes[nbatchdims + 1])
+                      + " in previous 'j' arguments.");
+            }
+
+            // And the number of "columns":
+            if (_shapes[off_i + nbatchdims + 2] != static_cast< int >(dimsY[k])) {
+                error("[KeOps] Wrong value of the 'vector size' dimension "
+                      + std::to_string(nbatchdims + 1) + " for arg at position " + std::to_string(i)
+                      + " : is " + std::to_string(_shapes[off_i + nbatchdims + 2])
+                      + " but should be " + std::to_string(dimsY[k]));
+            }
+#endif
+        }
+
+        // Parameter args: their rows are filled in, but row 0 (the output row) is left untouched.
+        for (int k = 0; k < nvarsP; k++) {
+            int i = indsP[k];
+            // Fill in the (i+1)-th line of the "shapes" array ---------------------------
+            int off_i = (i + 1) * (nbatchdims + 3);
+            // First, the batch dimensions:
+            for (int b = 0; b < nbatchdims; b++) {
+                _shapes[off_i + b] = get_val_batch(argshapes[i], nbatchdims + 2, b);
+            }
+            _shapes[off_i + nbatchdims + 2] = argshapes[i][nbatchdims];  // = "D"
+#if do_checks
+            int dim_param;
+            if (use_half)
+                dim_param = _shapes[off_i + nbatchdims + 2] / 2;
+            else
+                dim_param = _shapes[off_i + nbatchdims + 2];
+            if (dim_param != static_cast< int >(dimsP[k])) {
+                error("[KeOps] Wrong value of the 'vector size' dimension "
+                      + std::to_string(nbatchdims) + " for arg at position " + std::to_string(i)
+                      + " : is " + std::to_string(dim_param)
+                      + " but should be " + std::to_string(dimsP[k]));
+            }
+#endif
+        }
+    }
+
+}
+
+template< typename TYPE >
+void Sizes< TYPE >::switch_to_half2_indexing() {
+    // float16 inputs are handled as half2 pairs in the Cuda codes, so nx, ny, M, N and the
+    // matching entries of the shapes vector are divided by two here.
+    nx /= 2;
+    ny /= 2;
+    M /= 2;
+    N /= 2;
+    _shapes[nbatchdims] /= 2;
+    _shapes[nbatchdims + 1] /= 2;
+    const int row_len = nbatchdims + 3;
+    for (int row = 1; row <= nargs; ++row) {
+        // The category of the argument is no longer known at this point, but its last three
+        // entries are of the form (M,1,D), (1,N,D) or (1,1,D): halve the first of the two
+        // leading entries that exceeds 1, and fall back to D otherwise.
+        int *dims = &_shapes[row * row_len + nbatchdims];
+        int which = 2;  // default: the D entry
+        if (dims[0] > 1)
+            which = 0;
+        else if (dims[1] > 1) which = 1;
+        dims[which] /= 2;
+    }
+}
+
diff --git a/keopscore/keopscore/include/ranges_utils_win.h b/keopscore/keopscore/include/ranges_utils_win.h
new file mode 100644
index 000000000..372b61c06
--- /dev/null
+++ b/keopscore/keopscore/include/ranges_utils_win.h
@@ -0,0 +1,82 @@
+#pragma once
+
+// Returns i % shape[nbatchdims], plus the contribution of every batch dim of `shape` that is not 1.
+int broadcast_index(int i, int nbatchdims, int *full_shape, int *shape) {
+    const int inner = shape[nbatchdims];
+    int index = i % inner;
+    int stride = inner, full_stride = inner;  // running strides w.r.t. `shape` and `full_shape`
+    for (int b = nbatchdims - 1; b >= 0; --b) {
+        const int extent = shape[b];
+        if (extent != 1) index += ((i / full_stride) % extent) * stride;  // size-1 dims add nothing
+        full_stride *= full_shape[b];
+        stride *= extent;
+    }
+    return index;
+}
+
+void vect_broadcast_index(int i, int nbatchdims, int nvars, int *full_shape,
+                          int *reduced_shapes, int *out, int add_offset = 0) {
+    const int row_len = nbatchdims + 1;  // ints per variable in reduced_shapes
+    for (int k = 0; k < nvars; ++k)      // out[k] = add_offset + broadcast index of i for variable k
+        out[k] = add_offset + broadcast_index(i, nbatchdims, full_shape, &reduced_shapes[row_len * k]);
+}
+
+void fill_shapes(int nbatchdims, int *shapes, int *shapes_i, int *shapes_j, int *shapes_p,
+                 int tagJ,
+                 const std::vector< int > &indsi,
+                 const std::vector< int > &indsj,
+                 const std::vector< int > &indsp) {
+    // Split the packed "shapes" table into one small table per category of variable.
+    //
+    // shapes is an array of size (1+nargs)*(nbatchdims+3), which looks like:
+    // [ A, .., B, M, N, D_out]  -> output
+    // [ A, .., B, M, 1, D_1  ]  -> "i" variable
+    // [ A, .., B, 1, N, D_2  ]  -> "j" variable
+    // [ A, .., B, 1, 1, D_3  ]  -> "parameter"
+    // [ A, .., 1, M, 1, D_4  ]  -> N.B.: we support broadcasting on the batch dimensions!
+    // [ 1, .., 1, M, 1, D_5  ]  ->      (we'll just ask users to fill in the shapes with *explicit* ones)
+    //
+    // Each row written to shapes_i / shapes_j / shapes_p keeps the nbatchdims batch sizes of
+    // one argument followed by a single extra size, e.g. for shapes_i:
+    // [ A, .., B, M]
+    // [ A, .., 1, M]
+    // [ A, .., A, M]
+    //
+    // N.B.: tagJ == 1 means the reduction is made "over j" (the default mode), tagJ == 0 means
+    //       it is made "over i". Since "shapes" does not change, the column that is read is
+    //       swapped instead: nbatchdims + 1 - tagJ for "i" rows, nbatchdims + tagJ for "j" rows.
+
+    const int src_width = nbatchdims + 3;  // ints per row of shapes
+    const int dst_width = nbatchdims + 1;  // ints per row of shapes_i / shapes_j / shapes_p
+
+    const int count_i = indsi.size();
+    const int count_j = indsj.size();
+    const int count_p = indsp.size();
+
+    // "i" variables: batch sizes, then the size at column nbatchdims + 1 - tagJ.
+    for (int row = 0; row < count_i; row++) {
+        const int *src = shapes + (1 + indsi[row]) * src_width;
+        int *dst = shapes_i + row * dst_width;
+        for (int col = 0; col < nbatchdims; col++)
+            dst[col] = src[col];
+        dst[nbatchdims] = src[nbatchdims + 1 - tagJ];
+    }
+
+    // "j" variables: batch sizes, then the size at column nbatchdims + tagJ.
+    for (int row = 0; row < count_j; row++) {
+        const int *src = shapes + (1 + indsj[row]) * src_width;
+        int *dst = shapes_j + row * dst_width;
+        for (int col = 0; col < nbatchdims; col++)
+            dst[col] = src[col];
+        dst[nbatchdims] = src[nbatchdims + tagJ];
+    }
+
+    // Parameters: batch sizes, then a constant 1.
+    for (int row = 0; row < count_p; row++) {
+        const int *src = shapes + (1 + indsp[row]) * src_width;
+        int *dst = shapes_p + row * dst_width;
+        for (int col = 0; col < nbatchdims; col++)
+            dst[col] = src[col];
+        dst[nbatchdims] = 1;
+    }
+}
\ No newline at end of file
diff --git a/keopscore/keopscore/include/utils_pe_win.h b/keopscore/keopscore/include/utils_pe_win.h
new file mode 100644
index 000000000..6e172e476
--- /dev/null
+++ b/keopscore/keopscore/include/utils_pe_win.h
@@ -0,0 +1,100 @@
+#include <numeric>
+#include <cuda.h>
+// Runs NVRTC call x; if the result is not NVRTC_SUCCESS, prints the call text and nvrtcGetErrorString(result) to std::cerr, then throws std::runtime_error.
+#define NVRTC_SAFE_CALL(x)                                        \
+  do {                                                            \
+    nvrtcResult result = x;                                       \
+    if (result != NVRTC_SUCCESS) {                                \
+      std::cerr << "\nerror: " #x " failed with error "           \
+                << nvrtcGetErrorString(result) << '\n' << '\n';   \
+      throw std::runtime_error("[KeOps] NVRTC error.");           \
+    }                                                             \
+  } while(0)
+// Runs CUDA driver call x; if the result is not CUDA_SUCCESS, prints the call text and the cuGetErrorName() name to std::cerr, then calls exit(1) rather than throwing.
+#define CUDA_SAFE_CALL_NO_EXCEPTION(x)                            \
+  do {                                                            \
+    CUresult result = x;                                          \
+    if (result != CUDA_SUCCESS) {                                 \
+      const char *msg;                                            \
+      cuGetErrorName(result, &msg);                               \
+      std::cerr << "\n[KeOps] error: " #x " failed with error "   \
+                << msg << '\n' << '\n';                           \
+      exit(1);                                                     \
+    }                                                             \
+  } while(0)
+// Runs CUDA driver call x; if the result is not CUDA_SUCCESS, prints the call text and the cuGetErrorName() name to std::cerr, then throws std::runtime_error.
+#define CUDA_SAFE_CALL(x)                                        \
+  do {                                                            \
+    CUresult result = x;                                          \
+    if (result != CUDA_SUCCESS) {                                 \
+      const char *msg;                                            \
+      cuGetErrorName(result, &msg);                               \
+      std::cerr << "\n[KeOps] error: " #x " failed with error "   \
+                << msg << '\n' << '\n';                           \
+      throw std::runtime_error("[KeOps] Cuda error.");            \
+    }                                                             \
+  } while(0)
+
+
+// Reads a whole file into a malloc'ed, NUL-terminated buffer that the caller must free();
+// returns NULL when the file cannot be opened or sized, or when the allocation fails.
+char *read_text_file(char const *path) {
+    FILE *f = fopen(path, "rb");
+    if (!f) return NULL;
+    char *buffer = NULL;
+    long length = (fseek(f, 0, SEEK_END) == 0) ? ftell(f) : -1L;
+    if (length >= 0 && fseek(f, 0, SEEK_SET) == 0)
+        buffer = (char *) malloc((size_t) length + 1);
+    if (buffer) {
+        // terminate after the bytes actually read, which may be fewer than requested
+        size_t nread = fread(buffer, sizeof(char), (size_t) length, f);
+        buffer[nread] = '\0';
+    }
+    fclose(f);
+    return buffer;
+}
+
+
+// Allocates device memory for nargs pointers, copies the host pointer table `arg` into it (arg_d), and passes `out` through unchanged as out_d.
+template<typename TYPE>
+void load_args_FromDevice(CUdeviceptr &p_data, TYPE *out, TYPE *&out_d, int nargs, TYPE **arg, TYPE **&arg_d) {
+    CUDA_SAFE_CALL(cuMemAlloc(&p_data, sizeof(TYPE *) * nargs));
+    out_d = out;
+    arg_d = (TYPE **) p_data;
+    // copy the nargs pointers stored in the host array `arg` into the table just allocated
+    CUDA_SAFE_CALL(cuMemcpyHtoD((CUdeviceptr) arg_d, arg, nargs * sizeof(TYPE *)));
+}
+
+// Allocates one device block for the pointer table, the output and every argument, then copies the host arguments and the pointer table into it.
+template<typename TYPE>
+void
+load_args_FromHost(CUdeviceptr &p_data, TYPE *out, TYPE *&out_d, int nargs,
+                   TYPE **arg, TYPE **&arg_d,
+                   const std::vector< std::vector< int > > &argshape,
+                   int sizeout) {
+    std::vector<int> sizes(nargs);
+    int totsize = sizeout;  // total element count: output plus every argument
+    for (int k = 0; k < nargs; k++) {
+        sizes[k] = std::accumulate(argshape[k].begin(), argshape[k].end(), 1, std::multiplies< int >());  // product of the dims of argument k
+        totsize += sizes[k];
+    }
+    // single allocation laid out as: [nargs pointers][sizeout output elements][argument 0][argument 1]...
+    CUDA_SAFE_CALL(cuMemAlloc(&p_data, sizeof(TYPE *) * nargs + sizeof(TYPE) * totsize));
+
+    arg_d = (TYPE **) p_data;
+    TYPE *dataloc = (TYPE *) (arg_d + nargs);  // data area starts right after the pointer table
+
+    // host array of pointers to device data
+    std::vector<TYPE *> ph(nargs);
+
+    out_d = dataloc;  // output area comes first; `out` itself is not read and nothing is copied into this area here
+    dataloc += sizeout;
+    for (int k = 0; k < nargs; k++) {
+        ph[k] = dataloc;
+        CUDA_SAFE_CALL(cuMemcpyHtoD((CUdeviceptr) dataloc, arg[k], sizeof(TYPE) * sizes[k]));
+        dataloc += sizes[k];
+    }
+
+    // copy the array of device pointers (ph) into the pointer table at the start of the block
+    CUDA_SAFE_CALL(cuMemcpyHtoD((CUdeviceptr) arg_d, ph.data(), nargs * sizeof(TYPE *)));
+}
diff --git a/pykeops/pykeops/common/keops_io/LoadKeOps_nvrtc.py b/pykeops/pykeops/common/keops_io/LoadKeOps_nvrtc.py
index 95ef25489..71e3d621f 100644
--- a/pykeops/pykeops/common/keops_io/LoadKeOps_nvrtc.py
+++ b/pykeops/pykeops/common/keops_io/LoadKeOps_nvrtc.py
@@ -26,6 +26,7 @@ def init_phase2(self):
         pykeops_nvrtc = importlib.import_module("pykeops_nvrtc")
 
         if self.params.c_dtype == "float":
+            breakpoint()
             self.launch_keops = pykeops_nvrtc.KeOps_module_float(
                 self.params.device_id_request,
                 self.params.nargs,
@@ -85,7 +86,12 @@ def compile_jit_binary():
         dllname=pykeops.config.pykeops_nvrtc_name(type="target"),
     )
     pyKeOps_Message("Compiling nvrtc binder for python ... ", flush=True, end="")
-    KeOps_OS_Run(compile_command)
+    if os.name =="nt":
+        from keopscore.windows_compilations import compile_pykeops_nvrtc
+        compile_pykeops_nvrtc(build_folder=get_build_folder())
+    
+    else:
+        KeOps_OS_Run(compile_command)
     pyKeOps_Message("OK", use_tag=False, flush=True)
 
 

From 3631b9e149b906fa495afa72938042d8c3275662 Mon Sep 17 00:00:00 2001
From: Louis Pujol <52413616+Louis-Pujol@users.noreply.github.com>
Date: Fri, 11 Jul 2025 10:23:40 +0200
Subject: [PATCH 08/25] lint

---
 keopscore/keopscore/binders/LinkCompile.py          |  2 +-
 .../keopscore/binders/nvrtc/Gpu_link_compile.py     |  5 +----
 keopscore/keopscore/config/__init__.py              |  1 +
 keopscore/keopscore/config/cuda_windows.py          | 13 +++++--------
 pykeops/pykeops/common/keops_io/LoadKeOps_cpp.py    |  8 ++++----
 pykeops/pykeops/common/keops_io/LoadKeOps_nvrtc.py  |  5 +++--
 6 files changed, 15 insertions(+), 19 deletions(-)

diff --git a/keopscore/keopscore/binders/LinkCompile.py b/keopscore/keopscore/binders/LinkCompile.py
index e561745ce..d1ba38058 100644
--- a/keopscore/keopscore/binders/LinkCompile.py
+++ b/keopscore/keopscore/binders/LinkCompile.py
@@ -88,7 +88,7 @@ def write_code(self):
             f.write(self.code.replace("signed long int", "int"))
         else:
             f.write(self.code)
-            
+
         f.close()
 
     def generate_code(self):
diff --git a/keopscore/keopscore/binders/nvrtc/Gpu_link_compile.py b/keopscore/keopscore/binders/nvrtc/Gpu_link_compile.py
index e0b90774f..9617d94e0 100644
--- a/keopscore/keopscore/binders/nvrtc/Gpu_link_compile.py
+++ b/keopscore/keopscore/binders/nvrtc/Gpu_link_compile.py
@@ -34,10 +34,7 @@
 
 def jit_compile_dll():
     if os.name == "nt":
-        return os.path.join(
-            build_folder,
-            "nvrtc_jit.dll"
-        )
+        return os.path.join(build_folder, "nvrtc_jit.dll")
     else:
         return os.path.join(
             build_folder,
diff --git a/keopscore/keopscore/config/__init__.py b/keopscore/keopscore/config/__init__.py
index 314189442..a9c176ebd 100644
--- a/keopscore/keopscore/config/__init__.py
+++ b/keopscore/keopscore/config/__init__.py
@@ -1,6 +1,7 @@
 # Import the configuration classes
 from .base_config import Config
 import os
+
 if os.name != "nt":
     from .cuda import CUDAConfig
 else:
diff --git a/keopscore/keopscore/config/cuda_windows.py b/keopscore/keopscore/config/cuda_windows.py
index 5431f8b9c..6d73f5c3a 100644
--- a/keopscore/keopscore/config/cuda_windows.py
+++ b/keopscore/keopscore/config/cuda_windows.py
@@ -29,7 +29,6 @@
 from ..windows_compilations import cuda_detection
 
 
-
 detection = cuda_detection.detect_cuda_toolkit()
 # cuda_lib = detection['lib_dirs']
 # cuda_include = detection['include_dir']
@@ -53,7 +52,7 @@ def set_use_cuda(self):
         self._use_cuda = cuda_detection.cuda_available
         if not self._cuda_libraries_available():
             self._use_cuda = False
-        
+
         self.get_cuda_version()
         self.get_cuda_include_path()
         self.get_gpu_props()
@@ -71,8 +70,7 @@ def _cuda_libraries_available(self):
             This is also where we handle one single warning if needed.
         """
 
-        return 'dll_nvrtc' in detection and 'dll_cuda' in detection
-    
+        return "dll_nvrtc" in detection and "dll_cuda" in detection
 
     def get_cuda_version(self, out_type="single_value"):
 
@@ -81,7 +79,7 @@ def get_cuda_version(self, out_type="single_value"):
             return None
         try:
 
-            libcudart = ctypes.CDLL(detection['dll_cudart'])
+            libcudart = ctypes.CDLL(detection["dll_cudart"])
             cuda_version = ctypes.c_int()
             libcudart.cudaRuntimeGetVersion(ctypes.byref(cuda_version))
             cuda_version_value = int(cuda_version.value)
@@ -101,8 +99,7 @@ def get_cuda_version(self, out_type="single_value"):
             KeOps_Warning(f"Could not determine CUDA version: {e}")
             self.cuda_version = None
             return None
-        
-    
+
     def get_gpu_props(self):
         """
         Getting GPU properties and related attributes.
@@ -114,7 +111,7 @@ def get_gpu_props(self):
             return (self.n_gpus, self.gpu_compile_flags)
 
         # Attempt to load the CUDA driver library
-        libcuda_path = detection['dll_cuda']
+        libcuda_path = detection["dll_cuda"]
 
         # We have a handle, let's proceed
         libcuda = ctypes.CDLL(libcuda_path)
diff --git a/pykeops/pykeops/common/keops_io/LoadKeOps_cpp.py b/pykeops/pykeops/common/keops_io/LoadKeOps_cpp.py
index 17e0b26fb..23c9978fb 100644
--- a/pykeops/pykeops/common/keops_io/LoadKeOps_cpp.py
+++ b/pykeops/pykeops/common/keops_io/LoadKeOps_cpp.py
@@ -12,7 +12,6 @@
 from pykeops.config import pykeops_cpp_name, python_includes
 
 
-
 class LoadKeOps_cpp_class(LoadKeOps):
     def __init__(self, *args, fast_init=False):
         super().__init__(*args, fast_init=fast_init)
@@ -191,7 +190,7 @@ def get_pybind11_code(self):
     m.def("launch_pykeops_cpu", &launch_pykeops_{self.params.tag}_cpu < {cpp_dtype[self.params.dtype]} >, "Entry point to keops.");
 }}                     
             """
-        
+
         else:
             return f"""
 #include "{self.params.source_name}"
@@ -302,8 +301,9 @@ def get_pybind11_code(self):
     m.doc() = "pyKeOps: KeOps for pytorch through pybind11 (pytorch flavour).";
     m.def("launch_pykeops_cpu", &launch_pykeops_{self.params.tag}_cpu < {cpp_dtype[self.params.dtype]} >, "Entry point to keops.");
 }}                     
-            """.replace("long", "int64_t")
-
+            """.replace(
+                "long", "int64_t"
+            )
 
 
 LoadKeOps_cpp = Cache_partial(
diff --git a/pykeops/pykeops/common/keops_io/LoadKeOps_nvrtc.py b/pykeops/pykeops/common/keops_io/LoadKeOps_nvrtc.py
index 71e3d621f..755896cf2 100644
--- a/pykeops/pykeops/common/keops_io/LoadKeOps_nvrtc.py
+++ b/pykeops/pykeops/common/keops_io/LoadKeOps_nvrtc.py
@@ -86,10 +86,11 @@ def compile_jit_binary():
         dllname=pykeops.config.pykeops_nvrtc_name(type="target"),
     )
     pyKeOps_Message("Compiling nvrtc binder for python ... ", flush=True, end="")
-    if os.name =="nt":
+    if os.name == "nt":
         from keopscore.windows_compilations import compile_pykeops_nvrtc
+
         compile_pykeops_nvrtc(build_folder=get_build_folder())
-    
+
     else:
         KeOps_OS_Run(compile_command)
     pyKeOps_Message("OK", use_tag=False, flush=True)

From c2027193f801b9aed8d231713d45e045033e0d21 Mon Sep 17 00:00:00 2001
From: Louis Pujol <52413616+Louis-Pujol@users.noreply.github.com>
Date: Fri, 11 Jul 2025 11:34:46 +0200
Subject: [PATCH 09/25] typo in pybind11 code

---
 pykeops/pykeops/common/keops_io/LoadKeOps_cpp.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pykeops/pykeops/common/keops_io/LoadKeOps_cpp.py b/pykeops/pykeops/common/keops_io/LoadKeOps_cpp.py
index 23c9978fb..238b50314 100644
--- a/pykeops/pykeops/common/keops_io/LoadKeOps_cpp.py
+++ b/pykeops/pykeops/common/keops_io/LoadKeOps_cpp.py
@@ -162,7 +162,7 @@ def get_pybind11_code(self):
     }}
 
 
-    return launch_keops_{self.params.tag}_cpu< TYPE >(dimY,
+    return launch_keops_cpu_{self.params.tag} < TYPE >(dimY,
                                                       nx,
                                                       ny,
                                                       tagI,

From f13f62b8dc287452122a95b7885a0d67c6f8d072 Mon Sep 17 00:00:00 2001
From: Louis Pujol <52413616+Louis-Pujol@users.noreply.github.com>
Date: Fri, 11 Jul 2025 11:53:39 +0200
Subject: [PATCH 10/25] add `windows_compilation` package to `setup.py`

---
 keopscore/setup.py                               | 1 +
 pykeops/pykeops/common/keops_io/LoadKeOps_cpp.py | 2 ++
 2 files changed, 3 insertions(+)

diff --git a/keopscore/setup.py b/keopscore/setup.py
index a0854b4a2..dcbae7d02 100644
--- a/keopscore/setup.py
+++ b/keopscore/setup.py
@@ -59,6 +59,7 @@
         "keopscore.mapreduce.cpu",
         "keopscore.mapreduce.gpu",
         "keopscore.utils",
+        "keopscore.windows_compilations",
     ],
     package_data={
         "keopscore": [
diff --git a/pykeops/pykeops/common/keops_io/LoadKeOps_cpp.py b/pykeops/pykeops/common/keops_io/LoadKeOps_cpp.py
index 238b50314..19d7aaa81 100644
--- a/pykeops/pykeops/common/keops_io/LoadKeOps_cpp.py
+++ b/pykeops/pykeops/common/keops_io/LoadKeOps_cpp.py
@@ -11,6 +11,8 @@
 from keopscore.utils.misc_utils import KeOps_OS_Run
 from pykeops.config import pykeops_cpp_name, python_includes
 
+#TODO limit code duplication for get_pybind11_code()
+
 
 class LoadKeOps_cpp_class(LoadKeOps):
     def __init__(self, *args, fast_init=False):

From 0ae6e891d3eb91cc3583d6b846c75179af8a768d Mon Sep 17 00:00:00 2001
From: Louis Pujol <52413616+Louis-Pujol@users.noreply.github.com>
Date: Fri, 11 Jul 2025 16:11:47 +0200
Subject: [PATCH 11/25] update setup.pys (ugly fix for version on windows)

---
 keopscore/setup.py | 16 ++++++++++++++++
 pykeops/setup.py   |  9 +++++++++
 2 files changed, 25 insertions(+)

diff --git a/keopscore/setup.py b/keopscore/setup.py
index dcbae7d02..b961f0c40 100644
--- a/keopscore/setup.py
+++ b/keopscore/setup.py
@@ -11,6 +11,14 @@
 with open(os.path.join(here, "keopscore", "keops_version"), encoding="utf-8") as v:
     current_version = v.read().rstrip()
 
+#TODO fix this (issues with symlinks on windows ? -> moving to pyproject.toml ?)
+if os.name == "nt":
+    with open(os.path.join(here, "..", "keops_version"), encoding="utf-8") as v:
+        current_version = v.read().rstrip()
+    # copy the content to keopscore/keops_version
+    with open(os.path.join(here, "keopscore", "keops_version"), "w", encoding="utf-8") as v:
+        v.write(current_version)
+
 # Get the long description from the README file
 with open(path.join(here, "keopscore", "readme.md"), encoding="utf-8") as f:
     long_description = f.read()
@@ -74,6 +82,14 @@
             "include/Ranges.h",
             "include/Sizes.h",
             "include/utils_pe.h",
+            "binders/nvrtc/keops_nvrtc_win.cpp",
+            "binders/nvrtc/nvrtc_jit_win.cpp",
+            "include/CudaSizes_win.h",
+            "include/ranges_utils_win.h",
+            "include/Ranges_win.h",
+            "include/Sizes_win.h",
+            "include/utils_pe_win.h",
+            "windows_compilations/templates/CMakeLists.txt",
         ],
     },
     install_requires=[],
diff --git a/pykeops/setup.py b/pykeops/setup.py
index d8154d91f..f32a9a699 100644
--- a/pykeops/setup.py
+++ b/pykeops/setup.py
@@ -12,6 +12,14 @@
 with open(os.path.join(here, "pykeops", "keops_version"), encoding="utf-8") as v:
     current_version = v.read().rstrip()
 
+#TODO fix this (issues with symlinks on windows ? -> moving to pyproject.toml ?)
+if os.name == "nt":
+    with open(os.path.join(here, "..", "keops_version"), encoding="utf-8") as v:
+        current_version = v.read().rstrip()
+    # copy the content to pykeops/keops_version
+    with open(os.path.join(here, "pykeops", "keops_version"), "w", encoding="utf-8") as v:
+        v.write(current_version)
+
 # Get the long description from the README file
 with open(path.join(here, "pykeops", "readme.md"), encoding="utf-8") as f:
     long_description = f.read()
@@ -62,6 +70,7 @@
             "licence.txt",
             "keops_version",
             "common/keops_io/pykeops_nvrtc.cpp",
+            "common/keops_io/pykeops_nvrtc_win.cpp",
         ],
     },
     install_requires=["numpy", "pybind11", "keopscore"],

From 28884242e8d6091cc76873705240fc3615b1e23b Mon Sep 17 00:00:00 2001
From: Louis Pujol <52413616+Louis-Pujol@users.noreply.github.com>
Date: Fri, 11 Jul 2025 16:14:45 +0200
Subject: [PATCH 12/25] remove breakpoints

---
 keopscore/keopscore/binders/nvrtc/Gpu_link_compile.py | 1 -
 pykeops/pykeops/common/keops_io/LoadKeOps_nvrtc.py    | 1 -
 2 files changed, 2 deletions(-)

diff --git a/keopscore/keopscore/binders/nvrtc/Gpu_link_compile.py b/keopscore/keopscore/binders/nvrtc/Gpu_link_compile.py
index 9617d94e0..4833cec95 100644
--- a/keopscore/keopscore/binders/nvrtc/Gpu_link_compile.py
+++ b/keopscore/keopscore/binders/nvrtc/Gpu_link_compile.py
@@ -80,7 +80,6 @@ def generate_code(self):
         self.write_code()
         # we execute the main dll, passing the code as argument, and the name of the low level code file to save the assembly instructions
 
-        breakpoint()
         if os.name != "nt":
             res = self.my_c_dll.Compile(
                 create_string_buffer(self.low_level_code_file),
diff --git a/pykeops/pykeops/common/keops_io/LoadKeOps_nvrtc.py b/pykeops/pykeops/common/keops_io/LoadKeOps_nvrtc.py
index 755896cf2..13e6cdc8e 100644
--- a/pykeops/pykeops/common/keops_io/LoadKeOps_nvrtc.py
+++ b/pykeops/pykeops/common/keops_io/LoadKeOps_nvrtc.py
@@ -26,7 +26,6 @@ def init_phase2(self):
         pykeops_nvrtc = importlib.import_module("pykeops_nvrtc")
 
         if self.params.c_dtype == "float":
-            breakpoint()
             self.launch_keops = pykeops_nvrtc.KeOps_module_float(
                 self.params.device_id_request,
                 self.params.nargs,

From 830deb76e8ff154c87aeeefca66f8dbe94699a9b Mon Sep 17 00:00:00 2001
From: Louis Pujol <52413616+Louis-Pujol@users.noreply.github.com>
Date: Fri, 11 Jul 2025 16:32:05 +0200
Subject: [PATCH 13/25] cuda detection: set use cuda to false if CUDA_PATH is
 not accessible

---
 keopscore/keopscore/config/cuda_windows.py                 | 6 ++++++
 keopscore/keopscore/windows_compilations/cuda_detection.py | 7 +++----
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/keopscore/keopscore/config/cuda_windows.py b/keopscore/keopscore/config/cuda_windows.py
index 6d73f5c3a..8dbaa554c 100644
--- a/keopscore/keopscore/config/cuda_windows.py
+++ b/keopscore/keopscore/config/cuda_windows.py
@@ -50,6 +50,12 @@ class CUDAConfigWin(CUDAConfig):
     def set_use_cuda(self):
         """Determine and set whether to use CUDA."""
         self._use_cuda = cuda_detection.cuda_available
+        
+        if not self._use_cuda:
+            self.cuda_message = "CUDA libraries not detected; Switching to CPU only."
+            KeOps_Warning(self.cuda_message)
+        
+        # Check if both cuda and nvrtc libraries are available
         if not self._cuda_libraries_available():
             self._use_cuda = False
 
diff --git a/keopscore/keopscore/windows_compilations/cuda_detection.py b/keopscore/keopscore/windows_compilations/cuda_detection.py
index 43513c95d..63e8ad2be 100644
--- a/keopscore/keopscore/windows_compilations/cuda_detection.py
+++ b/keopscore/keopscore/windows_compilations/cuda_detection.py
@@ -2,14 +2,13 @@
 from ctypes.util import find_library
 from pathlib import Path
 
-cuda_available = find_library("nvcuda") is not None
-
+cuda_available = "CUDA_PATH" in os.environ
 
 def detect_cuda_toolkit():
 
-    if os.environ["CUDA_PATH"]:
+    output = {}
 
-        output = {}
+    if cuda_available:
 
         cuda_path = Path(
             os.environ["CUDA_PATH"]

From 5a51e698bf70720819dfe7b36953c87dd7ae238f Mon Sep 17 00:00:00 2001
From: Louis Pujol <52413616+Louis-Pujol@users.noreply.github.com>
Date: Fri, 11 Jul 2025 16:32:32 +0200
Subject: [PATCH 14/25] lint

---
 keopscore/keopscore/config/cuda_windows.py                 | 4 ++--
 keopscore/keopscore/windows_compilations/cuda_detection.py | 1 +
 keopscore/setup.py                                         | 6 ++++--
 pykeops/pykeops/common/keops_io/LoadKeOps_cpp.py           | 2 +-
 pykeops/setup.py                                           | 6 ++++--
 5 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/keopscore/keopscore/config/cuda_windows.py b/keopscore/keopscore/config/cuda_windows.py
index 8dbaa554c..21d5d6f4e 100644
--- a/keopscore/keopscore/config/cuda_windows.py
+++ b/keopscore/keopscore/config/cuda_windows.py
@@ -50,11 +50,11 @@ class CUDAConfigWin(CUDAConfig):
     def set_use_cuda(self):
         """Determine and set whether to use CUDA."""
         self._use_cuda = cuda_detection.cuda_available
-        
+
         if not self._use_cuda:
             self.cuda_message = "CUDA libraries not detected; Switching to CPU only."
             KeOps_Warning(self.cuda_message)
-        
+
         # Check if both cuda and nvrtc libraries are available
         if not self._cuda_libraries_available():
             self._use_cuda = False
diff --git a/keopscore/keopscore/windows_compilations/cuda_detection.py b/keopscore/keopscore/windows_compilations/cuda_detection.py
index 63e8ad2be..c6de764c3 100644
--- a/keopscore/keopscore/windows_compilations/cuda_detection.py
+++ b/keopscore/keopscore/windows_compilations/cuda_detection.py
@@ -4,6 +4,7 @@
 
 cuda_available = "CUDA_PATH" in os.environ
 
+
 def detect_cuda_toolkit():
 
     output = {}
diff --git a/keopscore/setup.py b/keopscore/setup.py
index b961f0c40..a68664515 100644
--- a/keopscore/setup.py
+++ b/keopscore/setup.py
@@ -11,12 +11,14 @@
 with open(os.path.join(here, "keopscore", "keops_version"), encoding="utf-8") as v:
     current_version = v.read().rstrip()
 
-#TODO fix this (issues with symlinks on windows ? -> moving to pyproject.toml ?)
+# TODO fix this (issues with symlinks on windows ? -> moving to pyproject.toml ?)
 if os.name == "nt":
     with open(os.path.join(here, "..", "keops_version"), encoding="utf-8") as v:
         current_version = v.read().rstrip()
     # copy the content to keopscore/keops_version
-    with open(os.path.join(here, "keopscore", "keops_version"), "w", encoding="utf-8") as v:
+    with open(
+        os.path.join(here, "keopscore", "keops_version"), "w", encoding="utf-8"
+    ) as v:
         v.write(current_version)
 
 # Get the long description from the README file
diff --git a/pykeops/pykeops/common/keops_io/LoadKeOps_cpp.py b/pykeops/pykeops/common/keops_io/LoadKeOps_cpp.py
index 19d7aaa81..961568023 100644
--- a/pykeops/pykeops/common/keops_io/LoadKeOps_cpp.py
+++ b/pykeops/pykeops/common/keops_io/LoadKeOps_cpp.py
@@ -11,7 +11,7 @@
 from keopscore.utils.misc_utils import KeOps_OS_Run
 from pykeops.config import pykeops_cpp_name, python_includes
 
-#TODO limit code duplication for get_pybind11_code()
+# TODO limit code duplication for get_pybind11_code()
 
 
 class LoadKeOps_cpp_class(LoadKeOps):
diff --git a/pykeops/setup.py b/pykeops/setup.py
index f32a9a699..9dc2c2fd1 100644
--- a/pykeops/setup.py
+++ b/pykeops/setup.py
@@ -12,12 +12,14 @@
 with open(os.path.join(here, "pykeops", "keops_version"), encoding="utf-8") as v:
     current_version = v.read().rstrip()
 
-#TODO fix this (issues with symlinks on windows ? -> moving to pyproject.toml ?)
+# TODO fix this (issues with symlinks on windows ? -> moving to pyproject.toml ?)
 if os.name == "nt":
     with open(os.path.join(here, "..", "keops_version"), encoding="utf-8") as v:
         current_version = v.read().rstrip()
     # copy the content to pykeops/keops_version
-    with open(os.path.join(here, "pykeops", "keops_version"), "w", encoding="utf-8") as v:
+    with open(
+        os.path.join(here, "pykeops", "keops_version"), "w", encoding="utf-8"
+    ) as v:
         v.write(current_version)
 
 # Get the long description from the README file

From 52f9b7eebb0bce8debf438b3a06e9518b155f1d5 Mon Sep 17 00:00:00 2001
From: Louis Pujol <52413616+Louis-Pujol@users.noreply.github.com>
Date: Fri, 11 Jul 2025 17:01:04 +0200
Subject: [PATCH 15/25] fix cuda detection

---
 keopscore/keopscore/config/__init__.py        |  2 ++
 keopscore/keopscore/config/cuda_windows.py    |  3 +++
 .../windows_compilations/detection.py         | 24 +++++++++----------
 3 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/keopscore/keopscore/config/__init__.py b/keopscore/keopscore/config/__init__.py
index a9c176ebd..11fa028ce 100644
--- a/keopscore/keopscore/config/__init__.py
+++ b/keopscore/keopscore/config/__init__.py
@@ -6,6 +6,8 @@
     from .cuda import CUDAConfig
 else:
     from .cuda_windows import CUDAConfigWin as CUDAConfig
+
+#TODO openmp and c++ compiler detection for windows
 from .openmp import OpenMPConfig
 from .Platform import DetectPlatform
 
diff --git a/keopscore/keopscore/config/cuda_windows.py b/keopscore/keopscore/config/cuda_windows.py
index 21d5d6f4e..e23c4240b 100644
--- a/keopscore/keopscore/config/cuda_windows.py
+++ b/keopscore/keopscore/config/cuda_windows.py
@@ -29,6 +29,8 @@
 from ..windows_compilations import cuda_detection
 
 
+cuda_available = cuda_detection.cuda_available
+
 detection = cuda_detection.detect_cuda_toolkit()
 # cuda_lib = detection['lib_dirs']
 # cuda_include = detection['include_dir']
@@ -52,6 +54,7 @@ def set_use_cuda(self):
         self._use_cuda = cuda_detection.cuda_available
 
         if not self._use_cuda:
+            print("fzefgrrtg")
             self.cuda_message = "CUDA libraries not detected; Switching to CPU only."
             KeOps_Warning(self.cuda_message)
 
diff --git a/keopscore/keopscore/windows_compilations/detection.py b/keopscore/keopscore/windows_compilations/detection.py
index 57548129b..059d3c9fe 100644
--- a/keopscore/keopscore/windows_compilations/detection.py
+++ b/keopscore/keopscore/windows_compilations/detection.py
@@ -4,7 +4,7 @@
 
 import pybind11
 
-from .cuda_detection import detect_cuda_toolkit
+from .cuda_detection import detect_cuda_toolkit, cuda_available
 from .utils import find_package_location
 
 include_dirs = {}
@@ -45,18 +45,18 @@
 
 
 # Cuda
-cuda_config = detect_cuda_toolkit()
+if cuda_available:
+    cuda_config = detect_cuda_toolkit()
+    for key in ["cuda", "nvrtc", "cudart"]:
 
-for key in ["cuda", "nvrtc", "cudart"]:
+        if f"dll_{key}" in cuda_config:
+            dlls[key] = cuda_config[f"dll_{key}"]
 
-    if f"dll_{key}" in cuda_config:
-        dlls[key] = cuda_config[f"dll_{key}"]
+        if key in cuda_config["lib_names"]:
+            lib_names[key] = cuda_config["lib_names"][key]
 
-    if key in cuda_config["lib_names"]:
-        lib_names[key] = cuda_config["lib_names"][key]
+    if "include_dir" in cuda_config:
+        include_dirs["cuda"] = cuda_config["include_dir"]
 
-if "include_dir" in cuda_config:
-    include_dirs["cuda"] = cuda_config["include_dir"]
-
-if "lib_dirs" in cuda_config:
-    lib_dirs["cuda"] = cuda_config["lib_dirs"]
+    if "lib_dirs" in cuda_config:
+        lib_dirs["cuda"] = cuda_config["lib_dirs"]

From e1f304c96b5d8b470db5245371c9923203c6fd76 Mon Sep 17 00:00:00 2001
From: Louis Pujol <52413616+Louis-Pujol@users.noreply.github.com>
Date: Fri, 11 Jul 2025 17:01:43 +0200
Subject: [PATCH 16/25] lint

---
 keopscore/keopscore/config/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/keopscore/keopscore/config/__init__.py b/keopscore/keopscore/config/__init__.py
index 11fa028ce..839b05793 100644
--- a/keopscore/keopscore/config/__init__.py
+++ b/keopscore/keopscore/config/__init__.py
@@ -7,7 +7,7 @@
 else:
     from .cuda_windows import CUDAConfigWin as CUDAConfig
 
-#TODO openmp and c++ compiler detection for windows
+# TODO openmp and c++ compiler detection for windows
 from .openmp import OpenMPConfig
 from .Platform import DetectPlatform
 

From 0222d16f3162dd1b705248ef161a443a95f5013f Mon Sep 17 00:00:00 2001
From: Louis Pujol <52413616+Louis-Pujol@users.noreply.github.com>
Date: Fri, 11 Jul 2025 19:45:28 +0200
Subject: [PATCH 17/25] fix argument types in call_keops for windows

---
 pykeops/pykeops/common/keops_io/LoadKeOps_nvrtc.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pykeops/pykeops/common/keops_io/LoadKeOps_nvrtc.py b/pykeops/pykeops/common/keops_io/LoadKeOps_nvrtc.py
index 13e6cdc8e..f46aed6c1 100644
--- a/pykeops/pykeops/common/keops_io/LoadKeOps_nvrtc.py
+++ b/pykeops/pykeops/common/keops_io/LoadKeOps_nvrtc.py
@@ -66,7 +66,7 @@ def call_keops(self, nx, ny):
             self.params.dimsp,
             self.ranges_ptr_new,
             self.outshape,
-            self.out_ptr,
+            self.out_ptr if os.name != "nt" else (self.out_ptr, ),
             self.args_ptr_new,
             self.argshapes_new,
         )

From 593b0594b3b35160d4be14389eb4b41cd38912ba Mon Sep 17 00:00:00 2001
From: Louis Pujol <52413616+Louis-Pujol@users.noreply.github.com>
Date: Fri, 11 Jul 2025 20:18:58 +0200
Subject: [PATCH 18/25] change type annotations for paths

---
 keopscore/keopscore/windows_compilations/compile.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/keopscore/keopscore/windows_compilations/compile.py b/keopscore/keopscore/windows_compilations/compile.py
index f0c556ea1..aa1b72f24 100644
--- a/keopscore/keopscore/windows_compilations/compile.py
+++ b/keopscore/keopscore/windows_compilations/compile.py
@@ -6,15 +6,14 @@
 
 from .globals import tmp_dir
 
-path_type = str | os.PathLike | bytes
 _empty_list = []
 
 
 def compile(
-    source_file: path_type,
+    source_file: os.PathLike,
     project_name: str | None = None,
-    includes: list[path_type] | None = _empty_list,
-    link_dirs: list[path_type] | None = _empty_list,
+    includes: list[os.PathLike] | None = _empty_list,
+    link_dirs: list[os.PathLike] | None = _empty_list,
     links: list[str] | None = _empty_list,
     macros: list[str] | None = _empty_list,
     suffix: str = ".dll",

From 3144bad1b8a4d180337bd4e822170a0c1254f6f2 Mon Sep 17 00:00:00 2001
From: Louis Pujol <52413616+Louis-Pujol@users.noreply.github.com>
Date: Fri, 11 Jul 2025 20:28:53 +0200
Subject: [PATCH 19/25] fix annotations issues in compile.py

---
 keopscore/keopscore/windows_compilations/compile.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/keopscore/keopscore/windows_compilations/compile.py b/keopscore/keopscore/windows_compilations/compile.py
index aa1b72f24..139dcb5e9 100644
--- a/keopscore/keopscore/windows_compilations/compile.py
+++ b/keopscore/keopscore/windows_compilations/compile.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 import os
 import shutil
 import sysconfig
@@ -6,14 +8,15 @@
 
 from .globals import tmp_dir
 
+path_type = str | os.PathLike
 _empty_list = []
 
 
 def compile(
-    source_file: os.PathLike,
+    source_file: path_type,
     project_name: str | None = None,
-    includes: list[os.PathLike] | None = _empty_list,
-    link_dirs: list[os.PathLike] | None = _empty_list,
+    includes: list[path_type] | None = _empty_list,
+    link_dirs: list[path_type] | None = _empty_list,
     links: list[str] | None = _empty_list,
     macros: list[str] | None = _empty_list,
     suffix: str = ".dll",

From f3813c66b32828ae16d7087ea301adbf9de80904 Mon Sep 17 00:00:00 2001
From: Louis Pujol <52413616+Louis-Pujol@users.noreply.github.com>
Date: Fri, 11 Jul 2025 20:32:43 +0200
Subject: [PATCH 20/25] fix typing for paths

---
 keopscore/keopscore/windows_compilations/compile.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/keopscore/keopscore/windows_compilations/compile.py b/keopscore/keopscore/windows_compilations/compile.py
index 139dcb5e9..9ee91f622 100644
--- a/keopscore/keopscore/windows_compilations/compile.py
+++ b/keopscore/keopscore/windows_compilations/compile.py
@@ -8,15 +8,13 @@
 
 from .globals import tmp_dir
 
-path_type = str | os.PathLike
 _empty_list = []
 
-
 def compile(
-    source_file: path_type,
+    source_file: os.PathLike,
     project_name: str | None = None,
-    includes: list[path_type] | None = _empty_list,
-    link_dirs: list[path_type] | None = _empty_list,
+    includes: list[os.PathLike] | None = _empty_list,
+    link_dirs: list[os.PathLike] | None = _empty_list,
     links: list[str] | None = _empty_list,
     macros: list[str] | None = _empty_list,
     suffix: str = ".dll",

From eb0e52aeaeabd93bb5c229fa521d88f46230e5e9 Mon Sep 17 00:00:00 2001
From: Louis Pujol <52413616+Louis-Pujol@users.noreply.github.com>
Date: Fri, 11 Jul 2025 20:38:09 +0200
Subject: [PATCH 21/25] remove dummy print

---
 keopscore/keopscore/config/cuda_windows.py          | 1 -
 keopscore/keopscore/windows_compilations/compile.py | 1 +
 pykeops/pykeops/common/keops_io/LoadKeOps_nvrtc.py  | 2 +-
 3 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/keopscore/keopscore/config/cuda_windows.py b/keopscore/keopscore/config/cuda_windows.py
index e23c4240b..47a8b4cee 100644
--- a/keopscore/keopscore/config/cuda_windows.py
+++ b/keopscore/keopscore/config/cuda_windows.py
@@ -54,7 +54,6 @@ def set_use_cuda(self):
         self._use_cuda = cuda_detection.cuda_available
 
         if not self._use_cuda:
-            print("fzefgrrtg")
             self.cuda_message = "CUDA libraries not detected; Switching to CPU only."
             KeOps_Warning(self.cuda_message)
 
diff --git a/keopscore/keopscore/windows_compilations/compile.py b/keopscore/keopscore/windows_compilations/compile.py
index 9ee91f622..d01699e4a 100644
--- a/keopscore/keopscore/windows_compilations/compile.py
+++ b/keopscore/keopscore/windows_compilations/compile.py
@@ -10,6 +10,7 @@
 
 _empty_list = []
 
+
 def compile(
     source_file: os.PathLike,
     project_name: str | None = None,
diff --git a/pykeops/pykeops/common/keops_io/LoadKeOps_nvrtc.py b/pykeops/pykeops/common/keops_io/LoadKeOps_nvrtc.py
index f46aed6c1..651d8d9b7 100644
--- a/pykeops/pykeops/common/keops_io/LoadKeOps_nvrtc.py
+++ b/pykeops/pykeops/common/keops_io/LoadKeOps_nvrtc.py
@@ -66,7 +66,7 @@ def call_keops(self, nx, ny):
             self.params.dimsp,
             self.ranges_ptr_new,
             self.outshape,
-            self.out_ptr if os.name != "nt" else (self.out_ptr, ),
+            self.out_ptr if os.name != "nt" else (self.out_ptr,),
             self.args_ptr_new,
             self.argshapes_new,
         )

From 192c0dfe95943dc7bb00bd3a33bbb6e4c5baa9ec Mon Sep 17 00:00:00 2001
From: Louis Pujol <52413616+Louis-Pujol@users.noreply.github.com>
Date: Fri, 19 Sep 2025 17:25:58 +0200
Subject: [PATCH 22/25] use cuda-pathfinder to find cuda dlls on windows

---
 keopscore/keopscore/config/cuda_windows.py    |   2 +-
 .../compile_pykeops_nvrtc.py                  |  12 +-
 .../windows_compilations/cuda_detection.py    |  10 +-
 .../cuda_detection/CMakeLists.txt             | 127 ++++++++++++++++++
 keopscore/setup.py                            |   5 +-
 5 files changed, 144 insertions(+), 12 deletions(-)
 create mode 100644 keopscore/keopscore/windows_compilations/cuda_detection/CMakeLists.txt

diff --git a/keopscore/keopscore/config/cuda_windows.py b/keopscore/keopscore/config/cuda_windows.py
index 47a8b4cee..0660ea010 100644
--- a/keopscore/keopscore/config/cuda_windows.py
+++ b/keopscore/keopscore/config/cuda_windows.py
@@ -201,7 +201,7 @@ def safe_call(dev_idx, result_code):
                 return (self.n_gpus, self.gpu_compile_flags)
             SharedMemPerBlock[d] = output.value
 
-        # Build compile flags string
+        # Build compile flags string #TODO auto detection for macros is here
         self.gpu_compile_flags = f"-DMAXIDGPU={self.n_gpus - 1} "
         for d in range(self.n_gpus):
             self.gpu_compile_flags += (
diff --git a/keopscore/keopscore/windows_compilations/compile_pykeops_nvrtc.py b/keopscore/keopscore/windows_compilations/compile_pykeops_nvrtc.py
index e27a09c3e..01478569d 100644
--- a/keopscore/keopscore/windows_compilations/compile_pykeops_nvrtc.py
+++ b/keopscore/keopscore/windows_compilations/compile_pykeops_nvrtc.py
@@ -15,12 +15,12 @@ def compile_pykeops_nvrtc(build_folder):
     source_file = pykeops_dir / "common" / "keops_io" / "pykeops_nvrtc_win.cpp"
 
     macros = [
-        "-DMAXIDGPU=0",
-        "-DMAXTHREADSPERBLOCK0=1024",
-        "-DSHAREDMEMPERBLOCK0=49152",
-        "-DnvrtcGetTARGET=nvrtcGetCUBIN",
-        "-DnvrtcGetTARGETSize=nvrtcGetCUBINSize",
-        '-DARCHTAG="sm"',
+        "-DMAXIDGPU=0", #TODO should be detected
+        "-DMAXTHREADSPERBLOCK0=1024", #TODO should be detected
+        "-DSHAREDMEMPERBLOCK0=49152", #TODO should be detected
+        "-DnvrtcGetTARGET=nvrtcGetCUBIN", #TODO should be detected
+        "-DnvrtcGetTARGETSize=nvrtcGetCUBINSize", #TODO should be detected
+        '-DARCHTAG="sm"', #TODO should be detected
     ]
 
     compile(
diff --git a/keopscore/keopscore/windows_compilations/cuda_detection.py b/keopscore/keopscore/windows_compilations/cuda_detection.py
index c6de764c3..62f56af40 100644
--- a/keopscore/keopscore/windows_compilations/cuda_detection.py
+++ b/keopscore/keopscore/windows_compilations/cuda_detection.py
@@ -2,8 +2,9 @@
 from ctypes.util import find_library
 from pathlib import Path
 
-cuda_available = "CUDA_PATH" in os.environ
+from cuda import pathfinder
 
+cuda_available = "CUDA_PATH" in os.environ
 
 def detect_cuda_toolkit():
 
@@ -14,12 +15,13 @@ def detect_cuda_toolkit():
         cuda_path = Path(
             os.environ["CUDA_PATH"]
         )  # base path for cuda installation (including bin, lib, include, etc.)
-        cuda_bin = Path(cuda_path, "bin")  # where the dlls are located
 
-        if find_library("nvcuda") is not None:
+        if find_library("nvcuda") is not None: # NVCUDA is the main CUDA driver library
             output["dll_cuda"] = find_library("nvcuda")
 
-        for file in cuda_bin.iterdir():
+        cuda_dlls_dir = Path(pathfinder.load_nvidia_dynamic_lib("cudart").abs_path).parent
+
+        for file in cuda_dlls_dir.iterdir():
 
             if file.name.startswith("cudart") and file.name.endswith(".dll"):
                 output["dll_cudart"] = str(file)
diff --git a/keopscore/keopscore/windows_compilations/cuda_detection/CMakeLists.txt b/keopscore/keopscore/windows_compilations/cuda_detection/CMakeLists.txt
new file mode 100644
index 000000000..30cca9752
--- /dev/null
+++ b/keopscore/keopscore/windows_compilations/cuda_detection/CMakeLists.txt
@@ -0,0 +1,127 @@
+cmake_minimum_required(VERSION 3.21)
+project(cuda_detection LANGUAGES CXX)
+
+# If you have multiple toolkits, pass -DCUDAToolkit_ROOT="C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.6"
+find_package(CUDAToolkit REQUIRED)
+
+# --- Derive include and lib dirs ------------------------------------------------
+# include_dir: usually one path
+list(GET CUDAToolkit_INCLUDE_DIRS 0 CUDA_DETECT_INCLUDE_DIR)
+
+# lib_dirs: prefer .../lib/x64 if present, else CUDAToolkit_LIBRARY_DIR
+set(CUDA_DETECT_LIB_DIR "${CUDAToolkit_LIBRARY_DIR}")
+if(EXISTS "${CUDAToolkit_LIBRARY_DIR}/x64")
+  set(CUDA_DETECT_LIB_DIR "${CUDAToolkit_LIBRARY_DIR}/x64")
+endif()
+
+# bin dir for DLLs
+set(_BIN "${CUDAToolkit_BIN_DIR}")
+
+# --- Version tag for NVRTC-builtins (12.6 -> 126) ------------------------------
+string(REPLACE "." ";" _ver_list "${CUDAToolkit_VERSION}")
+list(GET _ver_list 0 _maj)
+list(LENGTH _ver_list _len)
+if(_len GREATER 1)
+  list(GET _ver_list 1 _min)
+else()
+  set(_min 0)
+endif()
+math(EXPR _TAG "${_maj} * 10 + ${_min}")
+
+# --- dll_cuda (driver DLL) -----------------------------------------------------
+# Prefer System32 (64-bit)
+find_file(CUDA_DETECT_DLL_CUDA
+  NAMES nvcuda.dll
+  PATHS "C:/Windows/System32" "C:/Windows/SysWOW64"  # fallback shows 32-bit if only that exists
+  NO_DEFAULT_PATH)
+
+# --- dll_cudart (runtime DLL) --------------------------------------------------
+# Try versioned names first, then any cudart64*.dll in the toolkit bin
+set(CUDA_DETECT_DLL_CUDART "")
+foreach(_cand
+  "cudart64_${_maj}${_min}.dll"
+  "cudart64.dll")
+  find_file(CUDA_DETECT_DLL_CUDART NAMES "${_cand}" HINTS "${_BIN}" NO_DEFAULT_PATH)
+  if(CUDA_DETECT_DLL_CUDART)
+    break()
+  endif()
+endforeach()
+if(NOT CUDA_DETECT_DLL_CUDART)
+  file(GLOB _cudart_glob LIST_DIRECTORIES OFF "${_BIN}/cudart64*.dll")
+  if(_cudart_glob)
+    list(GET _cudart_glob 0 CUDA_DETECT_DLL_CUDART)
+  endif()
+endif()
+
+# --- dll_nvrtc (the NVRTC *builtins* DLL, i.e. nvrtc-builtins*.dll) ------------
+# Search for nvrtc-builtins matching our tag, else first available
+set(CUDA_DETECT_DLL_NVRTC "")
+file(GLOB _builtins_glob LIST_DIRECTORIES OFF "${_BIN}/nvrtc-builtins*.dll")
+if(_builtins_glob)
+  foreach(_dll IN LISTS _builtins_glob)
+    get_filename_component(_name "${_dll}" NAME)
+    if(_name MATCHES "nvrtc-builtins.*${_TAG}")
+      set(CUDA_DETECT_DLL_NVRTC "${_dll}")
+      break()
+    endif()
+  endforeach()
+  if(NOT CUDA_DETECT_DLL_NVRTC)
+    list(GET _builtins_glob 0 CUDA_DETECT_DLL_NVRTC)
+  endif()
+endif()
+
+# --- Sanity hints (optional warnings) -----------------------------------------
+if(NOT CUDA_DETECT_DLL_CUDA)
+  message(WARNING "nvcuda.dll not found in System32/SysWOW64.")
+endif()
+if(NOT CUDA_DETECT_DLL_CUDART)
+  message(WARNING "cudart64*.dll not found in ${_BIN}.")
+endif()
+if(NOT CUDA_DETECT_DLL_NVRTC)
+  message(WARNING "nvrtc-builtins*.dll not found in ${_BIN}.")
+endif()
+
+# --- Produce outputs (same keys as the Python detect_cuda_toolkit() dict) ------
+# lib_names are the import-library basenames
+set(_LIB_NAMES_PY "{'cuda': 'cuda', 'cudart': 'cudart', 'nvrtc': 'nvrtc'}")
+
+# Escape backslashes for Python/JSON strings
+function(_escape_for_py in_var out_var)
+  string(REPLACE "\\" "\\\\" _tmp "${${in_var}}")
+  set(${out_var} "${_tmp}" PARENT_SCOPE)
+endfunction()
+
+_escape_for_py(CUDA_DETECT_DLL_CUDA   _DLL_CUDA_PY)
+_escape_for_py(CUDA_DETECT_DLL_CUDART _DLL_CUDART_PY)
+_escape_for_py(CUDA_DETECT_DLL_NVRTC  _DLL_NVRTC_PY)
+_escape_for_py(CUDA_DETECT_INCLUDE_DIR _INCLUDE_PY)
+_escape_for_py(CUDA_DETECT_LIB_DIR     _LIBDIR_PY)
+
+# Python-style dict; string(CONCAT) is required: set() would join the pieces with ';'
+string(CONCAT _DICT_PY
+  "{'dll_cuda': '${_DLL_CUDA_PY}', "
+  "'dll_cudart': '${_DLL_CUDART_PY}', "
+  "'dll_nvrtc': '${_DLL_NVRTC_PY}', "
+  "'include_dir': '${_INCLUDE_PY}', "
+  "'lib_dirs': '${_LIBDIR_PY}', "
+  "'lib_names': ${_LIB_NAMES_PY}}"
+)
+
+message(STATUS "CUDA_DETECTION=${_DICT_PY}")
+
+# Also emit cuda_detect.json in the build dir (same values; JSON needs quoted keys).
+# string(CONCAT) is required: set() with several values would insert ';' between them.
+string(CONCAT _JSON
+  "{\n"
+  "  \"dll_cuda\": \"${_DLL_CUDA_PY}\",\n"
+  "  \"dll_cudart\": \"${_DLL_CUDART_PY}\",\n"
+  "  \"dll_nvrtc\": \"${_DLL_NVRTC_PY}\",\n"
+  "  \"include_dir\": \"${_INCLUDE_PY}\",\n"
+  "  \"lib_dirs\": \"${_LIBDIR_PY}\",\n"
+  "  \"lib_names\": {\"cuda\": \"cuda\", \"cudart\": \"cudart\", \"nvrtc\": \"nvrtc\"}\n"
+  "}\n")
+file(WRITE "${CMAKE_BINARY_DIR}/cuda_detect.json" "${_JSON}")
+
+# Dummy target so configure+build does something
+add_custom_target(show_detect ALL
+  COMMENT "CUDA detection complete. See message above or cuda_detect.json.")
diff --git a/keopscore/setup.py b/keopscore/setup.py
index a68664515..639e1b565 100644
--- a/keopscore/setup.py
+++ b/keopscore/setup.py
@@ -94,6 +94,9 @@
             "windows_compilations/templates/CMakeLists.txt",
         ],
     },
-    install_requires=[],
+    install_requires=[
+        "cuda-pathfinder",
+        "cmake"
+    ],
     extras_require={},
 )

From 58cb294b18e32b4c59b8553c7407a1712fb8e8d7 Mon Sep 17 00:00:00 2001
From: Louis Pujol <52413616+Louis-Pujol@users.noreply.github.com>
Date: Fri, 19 Sep 2025 17:27:39 +0200
Subject: [PATCH 23/25] lint

---
 .../windows_compilations/compile_pykeops_nvrtc.py    | 12 ++++++------
 .../keopscore/windows_compilations/cuda_detection.py |  7 +++++--
 keopscore/setup.py                                   |  5 +----
 3 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/keopscore/keopscore/windows_compilations/compile_pykeops_nvrtc.py b/keopscore/keopscore/windows_compilations/compile_pykeops_nvrtc.py
index 01478569d..361f00362 100644
--- a/keopscore/keopscore/windows_compilations/compile_pykeops_nvrtc.py
+++ b/keopscore/keopscore/windows_compilations/compile_pykeops_nvrtc.py
@@ -15,12 +15,12 @@ def compile_pykeops_nvrtc(build_folder):
     source_file = pykeops_dir / "common" / "keops_io" / "pykeops_nvrtc_win.cpp"
 
     macros = [
-        "-DMAXIDGPU=0", #TODO should be detected
-        "-DMAXTHREADSPERBLOCK0=1024", #TODO should be detected
-        "-DSHAREDMEMPERBLOCK0=49152", #TODO should be detected
-        "-DnvrtcGetTARGET=nvrtcGetCUBIN", #TODO should be detected
-        "-DnvrtcGetTARGETSize=nvrtcGetCUBINSize", #TODO should be detected
-        '-DARCHTAG="sm"', #TODO should be detected
+        "-DMAXIDGPU=0",  # TODO should be detected
+        "-DMAXTHREADSPERBLOCK0=1024",  # TODO should be detected
+        "-DSHAREDMEMPERBLOCK0=49152",  # TODO should be detected
+        "-DnvrtcGetTARGET=nvrtcGetCUBIN",  # TODO should be detected
+        "-DnvrtcGetTARGETSize=nvrtcGetCUBINSize",  # TODO should be detected
+        '-DARCHTAG="sm"',  # TODO should be detected
     ]
 
     compile(
diff --git a/keopscore/keopscore/windows_compilations/cuda_detection.py b/keopscore/keopscore/windows_compilations/cuda_detection.py
index 62f56af40..18554bb62 100644
--- a/keopscore/keopscore/windows_compilations/cuda_detection.py
+++ b/keopscore/keopscore/windows_compilations/cuda_detection.py
@@ -6,6 +6,7 @@
 
 cuda_available = "CUDA_PATH" in os.environ
 
+
 def detect_cuda_toolkit():
 
     output = {}
@@ -16,10 +17,12 @@ def detect_cuda_toolkit():
             os.environ["CUDA_PATH"]
         )  # base path for cuda installation (including bin, lib, include, etc.)
 
-        if find_library("nvcuda") is not None: # NVCUDA is the main CUDA driver library
+        if find_library("nvcuda") is not None:  # NVCUDA is the main CUDA driver library
             output["dll_cuda"] = find_library("nvcuda")
 
-        cuda_dlls_dir = Path(pathfinder.load_nvidia_dynamic_lib("cudart").abs_path).parent
+        cuda_dlls_dir = Path(
+            pathfinder.load_nvidia_dynamic_lib("cudart").abs_path
+        ).parent
 
         for file in cuda_dlls_dir.iterdir():
 
diff --git a/keopscore/setup.py b/keopscore/setup.py
index 639e1b565..e4fca04d2 100644
--- a/keopscore/setup.py
+++ b/keopscore/setup.py
@@ -94,9 +94,6 @@
             "windows_compilations/templates/CMakeLists.txt",
         ],
     },
-    install_requires=[
-        "cuda-pathfinder",
-        "cmake"
-    ],
+    install_requires=["cuda-pathfinder", "cmake"],
     extras_require={},
 )

From 7ba96196367446b0d528920b92d2c47d3ad489c5 Mon Sep 17 00:00:00 2001
From: Louis Pujol <Louis.Pujol@3ds.com>
Date: Fri, 19 Sep 2025 17:51:09 +0200
Subject: [PATCH 24/25] remove cuda pathfinder, check x86 folder for cuda dlls

---
 .../windows_compilations/cuda_detection.py    | 54 ++++++++++++++-----
 keopscore/setup.py                            |  2 +-
 2 files changed, 41 insertions(+), 15 deletions(-)

diff --git a/keopscore/keopscore/windows_compilations/cuda_detection.py b/keopscore/keopscore/windows_compilations/cuda_detection.py
index 18554bb62..1a8e7db11 100644
--- a/keopscore/keopscore/windows_compilations/cuda_detection.py
+++ b/keopscore/keopscore/windows_compilations/cuda_detection.py
@@ -1,9 +1,21 @@
+r"""
+CUDA toolkit detection on Windows.
+
+CUDA_PATH environment variable must be set. It is usually set by the CUDA installer, if not it
+must point to a valid CUDA installation (typically C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\vX.Y).
+
+The detection looks for the following files:
+- cudart*.dll
+- nvrtc-builtins*.dll
+- nvcuda.dll (CUDA driver library, usually located in system32 folder)
+- include directory
+- lib/x64 directory containing cuda.lib, nvrtc.lib and cudart.lib
+"""
+
 import os
 from ctypes.util import find_library
 from pathlib import Path
 
-from cuda import pathfinder
-
 cuda_available = "CUDA_PATH" in os.environ
 
 
@@ -20,19 +32,30 @@ def detect_cuda_toolkit():
         if find_library("nvcuda") is not None:  # NVCUDA is the main CUDA driver library
             output["dll_cuda"] = find_library("nvcuda")
 
-        cuda_dlls_dir = Path(
-            pathfinder.load_nvidia_dynamic_lib("cudart").abs_path
-        ).parent
-
-        for file in cuda_dlls_dir.iterdir():
-
-            if file.name.startswith("cudart") and file.name.endswith(".dll"):
-                output["dll_cudart"] = str(file)
-
-            if file.name.startswith("nvrtc-builtins") and file.name.endswith(".dll"):
-                output["dll_nvrtc"] = str(file)
+        cuda_path = Path(
+            os.environ["CUDA_PATH"]
+        )  # base path for cuda installation (including bin, lib, include, etc.)
 
-        # See the files in Path(cuda_path, "cmake") for something more automatic here
+        #################################################
+        # Detect relevant DLLs: cudart and nvrtc-builtins
+        #################################################
+
+        # Check both bin and bin/x86 directories for relevant DLLs
+        bin_dirs = [Path(cuda_path, "bin"), Path(cuda_path, "bin", "x86")]
+
+        for bin_dir in bin_dirs:
+            if bin_dir.is_dir():
+                for file in bin_dir.iterdir():
+                    if file.name.startswith("cudart") and file.name.endswith(".dll"):
+                        output["dll_cudart"] = str(file)
+                    if file.name.startswith("nvrtc-builtins") and file.name.endswith(
+                        ".dll"
+                    ):
+                        output["dll_nvrtc"] = str(file)
+
+        #################################################
+        # Detect include and lib directories
+        #################################################
         cuda_include = Path(cuda_path, "include")
         if cuda_include.is_dir():
             output["include_dir"] = str(cuda_include)
@@ -41,6 +64,9 @@ def detect_cuda_toolkit():
         if cuda_libs.is_dir():
             output["lib_dirs"] = str(cuda_libs)
 
+        #################################################
+        # Make sure that cudart, nvrtc and cuda libs are available
+        #################################################
         output["lib_names"] = {}
         for key in ["cuda", "nvrtc", "cudart"]:
 
diff --git a/keopscore/setup.py b/keopscore/setup.py
index e4fca04d2..77c9eaaa8 100644
--- a/keopscore/setup.py
+++ b/keopscore/setup.py
@@ -94,6 +94,6 @@
             "windows_compilations/templates/CMakeLists.txt",
         ],
     },
-    install_requires=["cuda-pathfinder", "cmake"],
+    install_requires=["cmake"],
     extras_require={},
 )

From e890b208a4b5fd8e4e69d6c4314ee6656797ed94 Mon Sep 17 00:00:00 2001
From: Louis Pujol <Louis.Pujol@3ds.com>
Date: Fri, 19 Sep 2025 17:57:03 +0200
Subject: [PATCH 25/25] typo x86 -> x64

---
 keopscore/keopscore/windows_compilations/cuda_detection.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/keopscore/keopscore/windows_compilations/cuda_detection.py b/keopscore/keopscore/windows_compilations/cuda_detection.py
index 1a8e7db11..3f4b8f63d 100644
--- a/keopscore/keopscore/windows_compilations/cuda_detection.py
+++ b/keopscore/keopscore/windows_compilations/cuda_detection.py
@@ -40,8 +40,8 @@ def detect_cuda_toolkit():
         # Detect relevant DLLs: cudart and nvrtc-builtins
         #################################################
 
-        # Check both bin and bin/x86 directories for relevant DLLs
-        bin_dirs = [Path(cuda_path, "bin"), Path(cuda_path, "bin", "x86")]
+        # Check both bin and bin/x64 directories for relevant DLLs
+        bin_dirs = [Path(cuda_path, "bin"), Path(cuda_path, "bin", "x64")]
 
         for bin_dir in bin_dirs:
             if bin_dir.is_dir():