diff --git a/keopscore/keopscore/binders/LinkCompile.py b/keopscore/keopscore/binders/LinkCompile.py
index aae5d70ea..d1ba38058 100644
--- a/keopscore/keopscore/binders/LinkCompile.py
+++ b/keopscore/keopscore/binders/LinkCompile.py
@@ -84,7 +84,11 @@ def read_info(self):
     def write_code(self):
         # write the generated code in the source file ; this is used as a subfunction of compile_code
         f = open(self.gencode_file, "w")
-        f.write(self.code)
+        if os.name == "nt":
+            f.write(self.code.replace("signed long int", "int"))
+        else:
+            f.write(self.code)
+
         f.close()
 
     def generate_code(self):
diff --git a/keopscore/keopscore/binders/nvrtc/Gpu_link_compile.py b/keopscore/keopscore/binders/nvrtc/Gpu_link_compile.py
index ccd6eb00d..4833cec95 100644
--- a/keopscore/keopscore/binders/nvrtc/Gpu_link_compile.py
+++ b/keopscore/keopscore/binders/nvrtc/Gpu_link_compile.py
@@ -1,6 +1,5 @@
 import os
 from ctypes import create_string_buffer, CDLL, c_int
-from os import RTLD_LAZY
 import sysconfig
 from os.path import join
 
@@ -34,10 +33,13 @@
 def jit_compile_dll():
-    return os.path.join(
-        build_folder,
-        "nvrtc_jit" + sysconfig.get_config_var("SHLIB_SUFFIX"),
-    )
+    if os.name == "nt":
+        return os.path.join(build_folder, "nvrtc_jit.dll")
+    else:
+        return os.path.join(
+            build_folder,
+            "nvrtc_jit" + sysconfig.get_config_var("SHLIB_SUFFIX"),
+        )
 
 
 class Gpu_link_compile(LinkCompile):
@@ -61,7 +63,10 @@ def __init__(self):
             self.low_level_code_prefix + self.gencode_filename,
         ).encode("utf-8")
-        self.my_c_dll = CDLL(jit_compile_dll(), mode=RTLD_LAZY)
+        if os.name != "nt":
+            self.my_c_dll = CDLL(jit_compile_dll(), mode=os.RTLD_LAZY)
+        else:
+            self.my_c_dll = CDLL(jit_compile_dll())
         # actual dll to be called is the jit binary, TODO: check if this is relevant
         self.true_dllname = jit_binary
         # file to check for existence to detect compilation is needed
@@ -75,16 +80,27 @@ def generate_code(self):
         self.write_code()
         # we execute the main dll, passing the code as argument, and the name of the low level code file to save the assembly instructions
-        res = self.my_c_dll.Compile(
-            create_string_buffer(self.low_level_code_file),
-            create_string_buffer(self.code.encode("utf-8")),
-            c_int(self.use_half),
-            c_int(self.use_fast_math),
-            c_int(self.device_id),
-            create_string_buffer(
-                (custom_cuda_include_fp16_path() + os.path.sep).encode("utf-8")
-            ),
-        )
+        if os.name != "nt":
+            res = self.my_c_dll.Compile(
+                create_string_buffer(self.low_level_code_file),
+                create_string_buffer(self.code.encode("utf-8")),
+                c_int(self.use_half),
+                c_int(self.use_fast_math),
+                c_int(self.device_id),
+                create_string_buffer(
+                    (custom_cuda_include_fp16_path() + os.path.sep).encode("utf-8")
+                ),
+            )
+        else:
+            res = self.my_c_dll.Compile(
+                create_string_buffer(self.low_level_code_file),
+                create_string_buffer(self.code.encode("utf-8")),
+                c_int(self.use_half),
+                c_int(self.device_id),
+                create_string_buffer(
+                    (custom_cuda_include_fp16_path() + os.path.sep).encode("utf-8")
+                ),
+            )
         if res != 0:
             KeOps_Error(
                 f"Error when compiling formula (error in nvrtcCompileProgram, nvrtcResult={res})"
             )
@@ -116,8 +132,14 @@ def get_compile_command(
     @staticmethod
     def compile_jit_compile_dll():
         KeOps_Message("Compiling cuda jit compiler engine ... 
", flush=True, end="") - command = Gpu_link_compile.get_compile_command( - sourcename=jit_compile_src, dllname=jit_compile_dll() - ) - KeOps_OS_Run(command) + if os.name == "nt": + from ...windows_compilations import compile_nvrtc_jit + + compile_nvrtc_jit(build_folder=build_folder) + else: + command = Gpu_link_compile.get_compile_command( + sourcename=jit_compile_src, dllname=jit_compile_dll() + ) + KeOps_OS_Run(command) + KeOps_Message("OK", use_tag=False, flush=True) diff --git a/keopscore/keopscore/binders/nvrtc/keops_nvrtc_win.cpp b/keopscore/keopscore/binders/nvrtc/keops_nvrtc_win.cpp new file mode 100644 index 000000000..017201363 --- /dev/null +++ b/keopscore/keopscore/binders/nvrtc/keops_nvrtc_win.cpp @@ -0,0 +1,604 @@ + +// nvcc -shared -Xcompiler -fPIC -lnvrtc -lcuda keops_nvrtc.cu -o keops_nvrtc.so +// g++ --verbose -L/opt/cuda/lib64 -L/opt/cuda/targets/x86_64-linux/lib/ -I/opt/cuda/targets/x86_64-linux/include/ -I../../include -shared -fPIC -lcuda -lnvrtc -fpermissive -DMAXIDGPU=0 -DMAXTHREADSPERBLOCK0=1024 -DSHAREDMEMPERBLOCK0=49152 -DnvrtcGetTARGET=nvrtcGetCUBIN -DnvrtcGetTARGETSize=nvrtcGetCUBINSize -DARCHTAG=\"sm\" keops_nvrtc.cpp -o keops_nvrtc.so +// g++ -std=c++11 -shared -fPIC -O3 -fpermissive -L /usr/lib -L /opt/cuda/lib64 -lcuda -lnvrtc -DnvrtcGetTARGET=nvrtcGetCUBIN -DnvrtcGetTARGETSize=nvrtcGetCUBINSize -DARCHTAG=\"sm\" -I/home/bcharlier/projets/keops/keops/keops/include -I/opt/cuda/include -I/usr/include/python3.10/ -DMAXIDGPU=0 -DMAXTHREADSPERBLOCK0=1024 -DSHAREDMEMPERBLOCK0=49152 /home/bcharlier/projets/keops/keops/keops/binders/nvrtc/keops_nvrtc.cpp -o keops_nvrtc.cpython-310-x86_64-linux-gnu.so + +#include +#include +#include +#include +#include +#include +#include +#include +#include +//#include + +#define C_CONTIGUOUS 1 +#define USE_HALF 0 + +#include "include/Sizes_win.h" +#include "include/Ranges_win.h" +#include "include/utils_pe_win.h" +#include "include/ranges_utils_win.h" + + +#include "include/CudaSizes_win.h" +#include + + +int *build_offset_tables(int nbatchdims, int *shapes, int nblocks, int *lookup_h, + const std::vector< int > &indsi, + const std::vector< int > &indsj, + const std::vector< int > &indsp, + int tagJ) { + + int sizei = indsi.size(); + int sizej = indsj.size(); + int sizep = indsp.size(); + + // Support for broadcasting over batch dimensions ============================================= + + int sizevars = sizei + sizej + sizep; + + // Separate and store the shapes of the "i" and "j" variables + parameters -------------- + // + // shapes is an array of size (1+nargs)*(nbatchdims+3), which looks like: + // [ A, .., B, M, N, D_out] -> output + // [ A, .., B, M, 1, D_1 ] -> "i" variable + // [ A, .., B, 1, N, D_2 ] -> "j" variable + // [ A, .., B, 1, 1, D_3 ] -> "parameter" + // [ A, .., 1, M, 1, D_4 ] -> N.B.: we support broadcasting on the batch dimensions! + // [ 1, .., 1, M, 1, D_5 ] -> (we'll just ask users to fill in the shapes with *explicit* ones) + + //int shapes_i[sizei * (nbatchdims + 1)], shapes_j[sizej * (nbatchdims + 1)], shapes_p[sizep * (nbatchdims + 1)]; + std::vector shapes_i(sizei * (nbatchdims + 1)); + std::vector shapes_j(sizej * (nbatchdims + 1)); + std::vector shapes_p(sizep * (nbatchdims + 1)); + + // First, we fill shapes_i with the "relevant" shapes of the "i" variables, + // making it look like, say: + // [ A, .., B, M] + // [ A, .., 1, M] + // [ A, .., A, M] + // Then, we do the same for shapes_j, but with "N" instead of "M". + // And finally for the parameters, with "1" instead of "M". 
+    fill_shapes(nbatchdims, shapes, shapes_i.data(), shapes_j.data(), shapes_p.data(), tagJ, indsi, indsj, indsp);
+
+    int tagIJ = tagJ; // 1 if the reduction is made "over j", 0 if it is made "over i"
+    int M = shapes[nbatchdims], N = shapes[nbatchdims + 1];
+
+    // We create a lookup table, "offsets", of shape (nblocks, SIZEVARS) --------
+    int *offsets_d = NULL;
+
+    //int offsets_h[nblocks * sizevars];
+    std::vector< int > offsets_h(nblocks * sizevars);
+
+    for (int k = 0; k < nblocks; k++) {
+        int range_id = (int) lookup_h[3 * k];
+        int start_x = tagIJ ? range_id * M : range_id * N;
+        int start_y = tagIJ ? range_id * N : range_id * M;
+
+        int patch_offset = (int) (lookup_h[3 * k + 1] - start_x);
+
+        vect_broadcast_index(start_x, nbatchdims, sizei, shapes, shapes_i.data(), offsets_h.data() + k * sizevars, patch_offset);
+        vect_broadcast_index(start_y, nbatchdims, sizej, shapes, shapes_j.data(), offsets_h.data() + k * sizevars + sizei);
+        vect_broadcast_index(range_id, nbatchdims, sizep, shapes, shapes_p.data(), offsets_h.data() + k * sizevars + sizei + sizej);
+    }
+
+    CUDA_SAFE_CALL(cuMemAlloc((CUdeviceptr *) &offsets_d, sizeof(int) * nblocks * sizevars));
+    CUDA_SAFE_CALL(cuMemcpyHtoD((CUdeviceptr) offsets_d, offsets_h.data(), sizeof(int) * nblocks * sizevars));
+
+    return offsets_d;
+}
+
+
+void range_preprocess_from_device(int &nblocks, int tagI, int nranges_x, int nranges_y, int **castedranges,
+                                  int nbatchdims, int *&slices_x_d, int *&ranges_y_d,
+                                  int *&lookup_d, int *&offsets_d, int blockSize_x,
+                                  const std::vector< int > &indsi,
+                                  const std::vector< int > &indsj,
+                                  const std::vector< int > &indsp,
+                                  int *shapes) {
+
+    // Ranges pre-processing... ==================================================================
+
+    // N.B.: In the following code, we assume that the x-ranges do not overlap.
+    // Otherwise, we'd have to assume that DIMRED == DIMOUT
+    // or allocate a buffer of size nx * DIMRED. This may be done in the future.
+    // Cf. reduction.h:
+    //    FUN::tagJ = 1 for a reduction over j, result indexed by i
+    //    FUN::tagJ = 0 for a reduction over i, result indexed by j
+
+    int tagJ = 1 - tagI;
+    int nranges = tagJ ? nranges_x : nranges_y;
+
+    int *ranges_x = tagJ ? castedranges[0] : castedranges[3];
+    int *slices_x = tagJ ? castedranges[1] : castedranges[4];
+    int *ranges_y = tagJ ? castedranges[2] : castedranges[5];
+
+    std::vector< int > ranges_x_h_arr(2 * nranges);
+    int *ranges_x_h;
+
+    // The code below needs a pointer to ranges_x on *host* memory, -------------------
+    // as well as pointers to slices_x and ranges_y on *device* memory.
+    // -> Depending on the "ranges" location, we'll copy ranges_x *or* slices_x and ranges_y
+    //    to the appropriate memory:
+    bool ranges_on_device = (nbatchdims == 0);
+    // N.B.: We only support Host ranges with Device data when these ranges were created
+    //       to emulate block-sparse reductions.
+
+    if (ranges_on_device) { // The ranges are on the device
+        ranges_x_h = &ranges_x_h_arr[0];
+        // Send data from device to host.
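+        // (each x-range is a [begin, end) pair, hence the 2 * nranges ints copied here)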
+        cuMemcpyDtoH(ranges_x_h, (CUdeviceptr) ranges_x, sizeof(int) * 2 * nranges);
+        slices_x_d = slices_x;
+        ranges_y_d = ranges_y;
+    } else { // The ranges are on host memory; this is typically what happens with **batch processing**,
+        // with ranges generated by keops_io.h:
+        ranges_x_h = ranges_x;
+        // Copy "slices_x" to the device:
+        CUDA_SAFE_CALL(cuMemAlloc((CUdeviceptr *) &slices_x_d, sizeof(int) * nranges));
+        CUDA_SAFE_CALL(cuMemcpyHtoD((CUdeviceptr) slices_x_d, slices_x, sizeof(int) * nranges));
+
+        // Copy "redranges_y" to the device: with batch processing, we KNOW that they have the same shape as ranges_x
+        CUDA_SAFE_CALL(cuMemAlloc((CUdeviceptr *) &ranges_y_d, sizeof(int) * 2 * nranges));
+        CUDA_SAFE_CALL(cuMemcpyHtoD((CUdeviceptr) ranges_y_d, ranges_y, sizeof(int) * 2 * nranges));
+    }
+
+    // Computes the number of blocks needed ---------------------------------------------
+    nblocks = 0;
+    int len_range = 0;
+    for (int i = 0; i < nranges; i++) {
+        len_range = ranges_x_h[2 * i + 1] - ranges_x_h[2 * i];
+        nblocks += (len_range / blockSize_x) + (len_range % blockSize_x == 0 ? 0 : 1);
+    }
+
+    // Create a lookup table for the blocks --------------------------------------------
+    std::vector< int > lookup_h(3 * nblocks);
+    int index = 0;
+
+    for (int i = 0; i < nranges; i++) {
+        len_range = ranges_x_h[2 * i + 1] - ranges_x_h[2 * i];
+        for (int j = 0; j < len_range; j += blockSize_x) {
+            lookup_h[3 * index] = i;
+            lookup_h[3 * index + 1] = ranges_x_h[2 * i] + j;
+            lookup_h[3 * index + 2] = ranges_x_h[2 * i] + j + std::min((int) blockSize_x, len_range - j);
+            index++;
+        }
+    }
+
+    // Load the table on the device -----------------------------------------------------
+    CUDA_SAFE_CALL(cuMemAlloc((CUdeviceptr *) &lookup_d, sizeof(int) * 3 * nblocks));
+    CUDA_SAFE_CALL(cuMemcpyHtoD((CUdeviceptr) lookup_d, lookup_h.data(), sizeof(int) * 3 * nblocks));
+
+
+    // Support for broadcasting over batch dimensions =============================================
+
+    // We create a lookup table, "offsets", of shape (nblocks, SIZEVARS):
+
+    if (nbatchdims > 0) {
+        offsets_d = build_offset_tables(nbatchdims, shapes, nblocks, lookup_h.data(),
+                                        indsi, indsj, indsp, tagJ);
+    }
+
+
+}
+
+
+void
+range_preprocess_from_host(int &nblocks, int tagI, int nranges_x, int nranges_y, int nredranges_x, int nredranges_y,
+                           int **castedranges,
+                           int nbatchdims, int *&slices_x_d, int *&ranges_y_d,
+                           int *&lookup_d, int *&offsets_d, int blockSize_x,
+                           const std::vector< int > &indsi,
+                           const std::vector< int > &indsj,
+                           const std::vector< int > &indsp,
+                           int *shapes) {
+
+    // Ranges pre-processing... ==================================================================
+
+    // N.B.: In the following code, we assume that the x-ranges do not overlap.
+    // Otherwise, we'd have to assume that DIMRED == DIMOUT
+    // or allocate a buffer of size nx * DIMRED. This may be done in the future.
+    // Cf. reduction.h:
+    //    FUN::tagJ = 1 for a reduction over j, result indexed by i
+    //    FUN::tagJ = 0 for a reduction over i, result indexed by j
+
+    int tagJ = 1 - tagI;
+    int nranges = tagJ ? nranges_x : nranges_y;
+    int nredranges = tagJ ? nredranges_y : nredranges_x;
+
+    int *ranges_x = tagJ ? castedranges[0] : castedranges[3];
+    int *slices_x = tagJ ? castedranges[1] : castedranges[4];
+    int *ranges_y = tagJ ?
castedranges[2] : castedranges[5];
+
+    // Computes the number of blocks needed ---------------------------------------------
+    nblocks = 0;
+    int len_range = 0;
+    for (int i = 0; i < nranges; i++) {
+        len_range = ranges_x[2 * i + 1] - ranges_x[2 * i];
+        nblocks += (len_range / blockSize_x) + (len_range % blockSize_x == 0 ? 0 : 1);
+    }
+
+    // Create a lookup table for the blocks --------------------------------------------
+    std::vector< int > lookup_h(3 * nblocks);
+    int index = 0;
+
+    for (int i = 0; i < nranges; i++) {
+        len_range = ranges_x[2 * i + 1] - ranges_x[2 * i];
+        for (int j = 0; j < len_range; j += blockSize_x) {
+            lookup_h[3 * index] = i;
+            lookup_h[3 * index + 1] = ranges_x[2 * i] + j;
+            lookup_h[3 * index + 2] = ranges_x[2 * i] + j + std::min((int) blockSize_x, len_range - j);
+            index++;
+        }
+    }
+
+    // Load the table on the device -----------------------------------------------------
+    CUDA_SAFE_CALL(cuMemAlloc((CUdeviceptr *) &lookup_d, sizeof(int) * 3 * nblocks));
+    CUDA_SAFE_CALL(cuMemcpyHtoD((CUdeviceptr) lookup_d, lookup_h.data(), sizeof(int) * 3 * nblocks));
+
+    // Send data from host to device:
+    CUDA_SAFE_CALL(cuMemAlloc((CUdeviceptr *) &slices_x_d, sizeof(int) * 2 * nranges));
+    CUDA_SAFE_CALL(cuMemcpyHtoD((CUdeviceptr) slices_x_d, slices_x, sizeof(int) * 2 * nranges));
+
+    CUDA_SAFE_CALL(cuMemAlloc((CUdeviceptr *) &ranges_y_d, sizeof(int) * 2 * nredranges));
+    CUDA_SAFE_CALL(cuMemcpyHtoD((CUdeviceptr) ranges_y_d, ranges_y, sizeof(int) * 2 * nredranges));
+
+
+    // Support for broadcasting over batch dimensions =============================================
+
+    // We create a lookup table, "offsets", of shape (nblocks, SIZEVARS):
+
+    if (nbatchdims > 0) {
+        offsets_d = build_offset_tables(nbatchdims, shapes, nblocks, lookup_h.data(),
+                                        indsi, indsj, indsp, tagJ);
+    }
+
+
+}
+
+
+template< typename TYPE >
+class KeOps_module {
+public :
+
+    CUdevice cuDevice;
+    CUcontext ctx;
+    CUmodule module;
+    char *target;
+    CUdeviceptr buffer;
+    int nargs;
+
+    void SetContext() {
+        CUcontext current_ctx;
+        CUDA_SAFE_CALL_NO_EXCEPTION(cuCtxGetCurrent(&current_ctx));
+        if (current_ctx != ctx)
+            CUDA_SAFE_CALL_NO_EXCEPTION(cuCtxPushCurrent(ctx));
+        CUDA_SAFE_CALL_NO_EXCEPTION(cuCtxGetCurrent(&current_ctx));
+    }
+
+
+    void Read_Target(const char *target_file_name) {
+        std::ifstream rf(target_file_name, std::ifstream::binary);
+        size_t targetSize;
+        rf.read((char *) &targetSize, sizeof(size_t));
+        target = new char[targetSize];
+        rf.read(target, targetSize);
+        rf.close();
+
+    }
+
+
+    KeOps_module(int device_id, int nargs_, const char *target_file_name) {
+
+        nargs = nargs_;
+
+        // init cuda in case not already done
+        CUDA_SAFE_CALL(cuInit(0));
+
+        // get the device and the primary context corresponding to device_id
+        CUDA_SAFE_CALL(cuDeviceGet(&cuDevice, device_id));
+        CUDA_SAFE_CALL(cuDevicePrimaryCtxRetain(&ctx, cuDevice));
+
+        // set the primary context as the active current context
+        SetContext();
+
+        // set global variables giving some properties of device
+        SetGpuProps(device_id);
+
+        // read the ptx or cubin file into a char array
+        Read_Target(target_file_name);
+
+        // load the corresponding module
+        CUDA_SAFE_CALL(cuModuleLoadDataEx(&module, target, 0, NULL, NULL));
+
+        // allocate a small memory buffer for "on device" computation mode,
+        // This is just used for storing the list of pointers to device data
+        // as a device array ; it is better to allocate it here once for all,
+        // otherwise allocating it at each call may cause a small overhead.
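+        // One device pointer per argument, i.e. nargs * sizeof(TYPE *) bytes in total.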
+        CUDA_SAFE_CALL(cuMemAlloc(&buffer, nargs * sizeof(TYPE *)));
+
+    }
+
+
+    ~KeOps_module() {
+        SetContext();
+        CUDA_SAFE_CALL_NO_EXCEPTION(cuMemFree(buffer));
+        CUDA_SAFE_CALL_NO_EXCEPTION(cuModuleUnload(module));
+        CUDA_SAFE_CALL_NO_EXCEPTION(cuDevicePrimaryCtxRelease(cuDevice));
+        delete[] target;
+    }
+
+    int launch_kernel(int tagHostDevice, int dimY, int nx, int ny,
+                      int tagI, int tagZero, int use_half,
+                      int tag1D2D, int dimred,
+                      int cuda_block_size, int use_chunk_mode,
+                      std::vector< int > indsi, std::vector< int > indsj, std::vector< int > indsp,
+                      int dimout,
+                      std::vector< int > dimsx, std::vector< int > dimsy, std::vector< int > dimsp,
+                      int **ranges,
+                      std::vector< int > shapeout, TYPE *out,
+                      TYPE **arg,
+                      std::vector< std::vector< int > > argshape
+    ) {
+
+
+        SetContext();
+
+        ////end_ = clock();
+        ////std::cout << "  time for set device : " << double(//end_ - start_) / CLOCKS_PER_SEC << std::endl;
+        //start_ = clock();
+
+        Sizes< TYPE > SS(nargs, arg, argshape, nx, ny,
+                         tagI, use_half,
+                         dimout,
+                         indsi, indsj, indsp,
+                         dimsx, dimsy, dimsp);
+
+        //end_ = clock();
+        //std::cout << "  time for Sizes : " << double(//end_ - start_) / CLOCKS_PER_SEC << std::endl;
+        //start_ = clock();
+
+        if (use_half)
+            SS.switch_to_half2_indexing();
+
+        Ranges< TYPE > RR(SS, ranges);
+        nx = SS.nx;
+        ny = SS.ny;
+
+        //end_ = clock();
+        //std::cout << "  time for Ranges : " << double(//end_ - start_) / CLOCKS_PER_SEC << std::endl;
+        //start_ = clock();
+
+        // now we switch (back...) indsi, indsj and dimsx, dimsy in case tagI=1.
+        // This is to be consistent with the convention used in the old
+        // bindings where i and j variables had different meanings in bindings
+        // and in the core code. Clearly we could do better if we
+        // carefully rewrite some parts of the code
+        if (tagI == 1) {
+            std::vector< int > tmp;
+
+            tmp = indsj;
+            indsj = indsi;
+            indsi = tmp;
+
+            tmp = dimsy;
+            dimsy = dimsx;
+            dimsx = tmp;
+        }
+
+
+        int blockSize_x = 1, blockSize_y = 1, blockSize_z = 1;
+
+        if (use_chunk_mode == 0) {
+            // warning : blockSize.x was previously set to CUDA_BLOCK_SIZE; currently CUDA_BLOCK_SIZE value is used as a bound.
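+            // In other words: blockSize_x = min(cuda_block_size, maxThreadsPerBlock,
+            //                                   sharedMemPerBlock / (dimY * sizeof(TYPE))),
+            // so that one block's shared-memory tile of dimY values per thread always fits.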
+            blockSize_x = std::min(cuda_block_size,
+                                   std::min(maxThreadsPerBlock,
+                                            (int) (sharedMemPerBlock / std::max(1, (int) (dimY * sizeof(TYPE))))
+                                   )
+            ); // number of threads in each block
+        } else {
+            // warning : the value here must match the one which is set in file GpuReduc1D_chunks.py, line 59
+            // and file GpuReduc1D_finalchunks.py, line 67
+            blockSize_x = std::min(cuda_block_size,
+                                   std::min(1024, (int) (49152 / std::max(1, (int) (dimY * sizeof(TYPE)))))
+            );
+        }
+
+        int nblocks;
+
+        if (tagI == 1) {
+            int tmp = ny;
+            ny = nx;
+            nx = tmp;
+        }
+
+        int *lookup_d = NULL, *slices_x_d = NULL, *ranges_y_d = NULL;
+        int *offsets_d = NULL;
+
+        if (RR.tagRanges == 1) {
+            if (tagHostDevice == 1) {
+                range_preprocess_from_device(nblocks, tagI, RR.nranges_x, RR.nranges_y, RR.castedranges,
+                                             SS.nbatchdims, slices_x_d, ranges_y_d, lookup_d,
+                                             offsets_d,
+                                             blockSize_x, indsi, indsj, indsp, SS.shapes);
+            } else { // tagHostDevice==0
+                range_preprocess_from_host(nblocks, tagI, RR.nranges_x, RR.nranges_y, RR.nredranges_x, RR.nredranges_y,
+                                           RR.castedranges,
+                                           SS.nbatchdims, slices_x_d, ranges_y_d, lookup_d,
+                                           offsets_d,
+                                           blockSize_x, indsi, indsj, indsp, SS.shapes);
+            }
+        }
+
+        ////end_ = clock();
+        ////std::cout << "  time for interm : " << double(//end_ - start_) / CLOCKS_PER_SEC << std::endl;
+        //start_ = clock();
+
+        CUdeviceptr p_data;
+        TYPE *out_d;
+        TYPE **arg_d;
+
+        int sizeout = std::accumulate(shapeout.begin(), shapeout.end(), 1, std::multiplies< int >());
+
+        if (tagHostDevice == 1) {
+            p_data = buffer;
+            load_args_FromDevice(p_data, out, out_d, nargs, arg, arg_d);
+        } else
+            load_args_FromHost(p_data, out, out_d, nargs, arg, arg_d, argshape, sizeout);
+
+        ////end_ = clock();
+        ////std::cout << "  time for load_args : " << double(//end_ - start_) / CLOCKS_PER_SEC << std::endl;
+        //start_ = clock();
+
+        CUfunction kernel;
+
+        int gridSize_x = 1, gridSize_y = 1, gridSize_z = 1;
+
+        if (tag1D2D == 1) { // 2D scheme
+
+            gridSize_x = nx / blockSize_x + (nx % blockSize_x == 0 ? 0 : 1);
+            gridSize_y = ny / blockSize_x + (ny % blockSize_x == 0 ? 0 : 1);
+
+            // Reduce : grid and block are both 1d
+            int blockSize2_x = 1, blockSize2_y = 1, blockSize2_z = 1;
+            blockSize2_x = blockSize_x; // number of threads in each block
+            int gridSize2_x = 1, gridSize2_y = 1, gridSize2_z = 1;
+            gridSize2_x = (nx * dimred) / blockSize2_x + ((nx * dimred) % blockSize2_x == 0 ? 0 : 1);
+
+            // Data on the device. We need an "inflated" outB, which contains gridSize.y "copies" of out
+            // that will be reduced in the final pass.
+            TYPE *outB;
+
+            // single cudaMalloc
+            CUdeviceptr p_data_outB;
+            CUDA_SAFE_CALL(cuMemAlloc(&p_data_outB, sizeof(TYPE) * (nx * dimred * gridSize_y)));
+
+            outB = (TYPE *) p_data_outB;
+
+            CUDA_SAFE_CALL(cuModuleGetFunction(&kernel, module, "GpuConv2DOnDevice"));
+
+            void *kernel_params[4];
+            kernel_params[0] = &nx;
+            kernel_params[1] = &ny;
+            kernel_params[2] = &outB;
+            kernel_params[3] = &arg_d;
+
+            // Size of the SharedData : blockSize.x*(DIMY)*sizeof(TYPE)
+
+            CUDA_SAFE_CALL(cuLaunchKernel(kernel,
+                                          gridSize_x, gridSize_y, gridSize_z, // grid dim
+                                          blockSize_x, blockSize_y, blockSize_z, // block dim
+                                          blockSize_x * dimY * sizeof(TYPE), NULL, // shared mem and stream
+                                          kernel_params, 0));
+            // block until the device has completed
+            CUDA_SAFE_CALL(cuCtxSynchronize());
+
+            // Since we've used a 2D scheme, there's still a "blockwise" line reduction to make on
+            // the output array px_d[0] = x1B. 
We go from shape ( gridSize.y * nx, DIMRED ) to (nx, DIMOUT) + CUfunction kernel_reduce; + CUDA_SAFE_CALL(cuModuleGetFunction(&kernel_reduce, module, "reduce2D")); + void *kernel_reduce_params[4]; + kernel_reduce_params[0] = &outB; + kernel_reduce_params[1] = &out_d; + kernel_reduce_params[2] = &gridSize_y; + kernel_reduce_params[3] = &nx; + + CUDA_SAFE_CALL(cuLaunchKernel(kernel_reduce, + gridSize2_x, gridSize2_y, gridSize2_z, // grid dim + blockSize2_x, blockSize2_y, blockSize2_z, // block dim + 0, NULL, // shared mem and stream + kernel_reduce_params, 0)); + + + } else if (RR.tagRanges == 1 && tagZero == 0) { + // ranges mode + + gridSize_x = nblocks; + + CUDA_SAFE_CALL(cuModuleGetFunction(&kernel, module, "GpuConv1DOnDevice_ranges")); + // std::cout << "GpuConv1DOnDevice_ranges " << nx << " " << gridSize_x ; + void *kernel_params[9]; + kernel_params[0] = &nx; + kernel_params[1] = &ny; + kernel_params[2] = &SS.nbatchdims; + kernel_params[3] = &offsets_d; + kernel_params[4] = &lookup_d; + kernel_params[5] = &slices_x_d; + kernel_params[6] = &ranges_y_d; + kernel_params[7] = &out_d; + kernel_params[8] = &arg_d; + + CUDA_SAFE_CALL(cuLaunchKernel(kernel, + gridSize_x, gridSize_y, gridSize_z, // grid dim + blockSize_x, blockSize_y, blockSize_z, // block dim + blockSize_x * dimY * sizeof(TYPE), NULL, // shared mem and stream + kernel_params, 0)); // arguments + + } else { + // simple mode + + gridSize_x = nx / blockSize_x + (nx % blockSize_x == 0 ? 0 : 1); + + CUDA_SAFE_CALL(cuModuleGetFunction(&kernel, module, "GpuConv1DOnDevice")); + + void *kernel_params[4]; + kernel_params[0] = &nx; + kernel_params[1] = &ny; + kernel_params[2] = &out_d; + kernel_params[3] = &arg_d; + + //std::cout << "GpuConv1DOnDevice " << nx << " " << gridSize_x ;//<< " " << gridSize_y << " " << gridSize_z << " " << blockSize_x << " " << blockSize_y << " " << blockSize_z << " " << blockSize_x * dimY * sizeof(TYPE) << std::endl; + + CUDA_SAFE_CALL(cuLaunchKernel(kernel, + gridSize_x, gridSize_y, gridSize_z, // grid dim + blockSize_x, blockSize_y, blockSize_z, // block dim + blockSize_x * dimY * sizeof(TYPE), NULL, // shared mem and stream + kernel_params, 0)); // arguments + } + + CUDA_SAFE_CALL(cuCtxSynchronize()); + + ////end_ = clock(); + ////std::cout << " time for kernel : " << double(//end_ - start_) / CLOCKS_PER_SEC << std::endl; + //start_ = clock(); + + // Send data from device to host. 
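+        // (only needed when the data lives on the host: with tagHostDevice == 1,
+        //  "out" already points to device memory and no copy-back is required)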
+
+
+        if (tagHostDevice == 0) {
+
+            CUDA_SAFE_CALL(cuMemcpyDtoH(out, (CUdeviceptr) out_d, sizeof(TYPE) * sizeout));
+            CUDA_SAFE_CALL(cuMemFree(p_data));
+
+        }
+
+        if (RR.tagRanges == 1) {
+            CUDA_SAFE_CALL(cuMemFree((CUdeviceptr) lookup_d));
+            if (SS.nbatchdims > 0) {
+                CUDA_SAFE_CALL(cuMemFree((CUdeviceptr) slices_x_d));
+                CUDA_SAFE_CALL(cuMemFree((CUdeviceptr) ranges_y_d));
+                CUDA_SAFE_CALL(cuMemFree((CUdeviceptr) offsets_d));
+            }
+        }
+
+        //end_ = end = clock();
+        ////std::cout << "  time for last part : " << double(//end_ - start_) / CLOCKS_PER_SEC << std::endl;
+        ////std::cout << "time for launch_keops inner : " << double(end - start) / CLOCKS_PER_SEC << std::endl;
+
+        return 0;
+    }
+
+};
+
+
+template
+class KeOps_module< float >;
+
+template
+class KeOps_module< double >;
+
+template
+class KeOps_module< half2 >;
diff --git a/keopscore/keopscore/binders/nvrtc/nvrtc_jit_win.cpp b/keopscore/keopscore/binders/nvrtc/nvrtc_jit_win.cpp
new file mode 100644
index 000000000..39bb6c72c
--- /dev/null
+++ b/keopscore/keopscore/binders/nvrtc/nvrtc_jit_win.cpp
@@ -0,0 +1,114 @@
+// nvcc -shared -Xcompiler -fPIC -lnvrtc -lcuda keops_nvrtc.cu -o keops_nvrtc.so
+// g++ --verbose -L/opt/cuda/lib64 -L/opt/cuda/targets/x86_64-linux/lib/ -I/opt/cuda/targets/x86_64-linux/include/ -I../../include -shared -fPIC -lcuda -lnvrtc -fpermissive -DMAXIDGPU=0 -DMAXTHREADSPERBLOCK0=1024 -DSHAREDMEMPERBLOCK0=49152 -DnvrtcGetTARGET=nvrtcGetCUBIN -DnvrtcGetTARGETSize=nvrtcGetCUBINSize -DARCHTAG=\"sm\" keops_nvrtc.cpp -o keops_nvrtc.so
+// g++ -std=c++11 -shared -fPIC -O3 -fpermissive -L /usr/lib -L /opt/cuda/lib64 -lcuda -lnvrtc -DnvrtcGetTARGET=nvrtcGetCUBIN -DnvrtcGetTARGETSize=nvrtcGetCUBINSize -DARCHTAG=\"sm\" -I/home/bcharlier/projets/keops/keops/keops/include -I/opt/cuda/include -I/usr/include/python3.10/ -DMAXIDGPU=0 -DMAXTHREADSPERBLOCK0=1024 -DSHAREDMEMPERBLOCK0=49152 /home/bcharlier/projets/keops/keops/keops/binders/nvrtc/keops_nvrtc.cpp -o keops_nvrtc.cpython-310-x86_64-linux-gnu.so
+
+#include <nvrtc.h>
+#include <cuda.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <vector>
+#include <algorithm>
+//#include <ctime>
+
+#define C_CONTIGUOUS 1
+#define USE_HALF 0
+
+#ifdef _WIN32
+#define DLL_EXPORT extern "C" __declspec(dllexport)
+#else
+#define DLL_EXPORT extern "C"
+#endif
+
+#include "include/Sizes_win.h"
+#include "include/Ranges_win.h"
+#include "include/utils_pe_win.h"
+#include "include/ranges_utils_win.h"
+
+
+#include "include/CudaSizes_win.h"
+#include <cuda_fp16.h>
+
+DLL_EXPORT
+int Compile(const char *target_file_name, const char *cu_code, int use_half, int device_id,
+            const char *cuda_include_path) {
+
+    nvrtcProgram prog;
+
+    int numHeaders;
+    const char *header_names[2];
+    const char *header_sources[2];
+
+    std::ostringstream cuda_fp16_h_path, cuda_fp16_hpp_path;
+    cuda_fp16_h_path << cuda_include_path << "cuda_fp16.h";
+    cuda_fp16_hpp_path << cuda_include_path << "cuda_fp16.hpp";
+
+    if (use_half) {
+        numHeaders = 2;
+        header_names[0] = "cuda_fp16.h";
+        header_sources[0] = read_text_file(cuda_fp16_h_path.str().c_str());
+
+        header_names[1] = "cuda_fp16.hpp";
+        header_sources[1] = read_text_file(cuda_fp16_hpp_path.str().c_str());
+
+    } else {
+        numHeaders = 0;
+    }
+
+    // Get device id from Driver API
+    CUdevice cuDevice;
+    CUDA_SAFE_CALL(cuDeviceGet(&cuDevice, device_id));
+
+    // Get Compute Capability from Driver API
+    int deviceProp_major, deviceProp_minor;
+    CUDA_SAFE_CALL(cuDeviceGetAttribute(&deviceProp_major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
+    CUDA_SAFE_CALL(cuDeviceGetAttribute(&deviceProp_minor,
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
+
+    std::ostringstream arch_flag;
+    arch_flag << "-arch=" << ARCHTAG << "_" << deviceProp_major << deviceProp_minor;
+
+    char *arch_flag_char = strdup(arch_flag.str().c_str());
+    const char *opts[] = {arch_flag_char, "-use_fast_math"};
+
+    NVRTC_SAFE_CALL(nvrtcCreateProgram(&prog,          // prog
+                                       cu_code,        // buffer
+                                       NULL,           // name
+                                       numHeaders,     // numHeaders
+                                       header_sources, // headers
+                                       header_names    // includeNames
+    ));
+
+    nvrtcResult compileResult = nvrtcCompileProgram(prog,  // prog
+                                                    2,     // numOptions
+                                                    opts); // options
+
+    if (compileResult != NVRTC_SUCCESS) {
+        throw std::runtime_error("[KeOps] Error when compiling formula (error in nvrtcCompileProgram).");
+    }
+
+    free(arch_flag_char);
+
+    // Obtain PTX or CUBIN from the program.
+    size_t targetSize;
+    NVRTC_SAFE_CALL(nvrtcGetTARGETSize(prog, &targetSize));
+
+    char *target = new char[targetSize];
+    NVRTC_SAFE_CALL(nvrtcGetTARGET(prog, target));
+
+    // Destroy the program.
+    NVRTC_SAFE_CALL(nvrtcDestroyProgram(&prog));
+
+    // write PTX or CUBIN code to file
+
+    std::ofstream wf(target_file_name, std::ofstream::binary);
+    wf.write((char *) &targetSize, sizeof(size_t));
+    wf.write(target, targetSize);
+    wf.close();
+
+    delete[] target;
+
+    return 0;
+}
diff --git a/keopscore/keopscore/config/__init__.py b/keopscore/keopscore/config/__init__.py
index fc31a0b9f..839b05793 100644
--- a/keopscore/keopscore/config/__init__.py
+++ b/keopscore/keopscore/config/__init__.py
@@ -1,6 +1,13 @@
 # Import the configuration classes
 from .base_config import Config
-from .cuda import CUDAConfig
+import os
+
+if os.name != "nt":
+    from .cuda import CUDAConfig
+else:
+    from .cuda_windows import CUDAConfigWin as CUDAConfig
+
+# TODO openmp and c++ compiler detection for windows
 from .openmp import OpenMPConfig
 from .Platform import DetectPlatform
diff --git a/keopscore/keopscore/config/cuda_windows.py b/keopscore/keopscore/config/cuda_windows.py
new file mode 100644
index 000000000..0660ea010
--- /dev/null
+++ b/keopscore/keopscore/config/cuda_windows.py
@@ -0,0 +1,212 @@
+import os
+import ctypes
+from ctypes.util import find_library
+from ctypes import (
+    c_int,
+    c_void_p,
+    c_char_p,
+    CDLL,
+    byref,
+    cast,
+    POINTER,
+    Structure,
+    RTLD_GLOBAL,
+)
+from pathlib import Path
+import shutil
+from os.path import join
+import platform
+import tempfile
+import subprocess
+import sys
+import keopscore
+from keopscore.utils.misc_utils import KeOps_Warning
+from keopscore.utils.misc_utils import KeOps_OS_Run
+from keopscore.utils.misc_utils import CHECK_MARK, CROSS_MARK
+
+from .cuda import CUDAConfig
+
+from ..windows_compilations import cuda_detection
+
+
+cuda_available = cuda_detection.cuda_available
+
+detection = cuda_detection.detect_cuda_toolkit()
+# cuda_lib = detection['lib_dirs']
+# cuda_include = detection['include_dir']
+# cuda_dll = detection['dll_cuda']
+# cudart_dll = detection['dll_cudart']
+# cuda_nvrtc = detection['dll_nvrtc']
+
+
+class CUDAConfigWin(CUDAConfig):
+    """
+    Class for CUDA detection and configuration on Windows.
+    """
+
+    # CUDA constants
+    CUDA_SUCCESS = 0
+    CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1
+    CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK = 8
+
+    def set_use_cuda(self):
+        """Determine and set whether to use CUDA."""
+        self._use_cuda = cuda_detection.cuda_available
+
+        if not self._use_cuda:
+            self.cuda_message = "CUDA libraries not detected; Switching to CPU only."
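+            # (stored on the instance, and surfaced immediately through the warning below)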
+ KeOps_Warning(self.cuda_message) + + # Check if both cuda and nvrtc libraries are available + if not self._cuda_libraries_available(): + self._use_cuda = False + + self.get_cuda_version() + self.get_cuda_include_path() + self.get_gpu_props() + + if self.n_gpus == 0 and self._use_cuda: + self._use_cuda = False + self.cuda_message = "CUDA libraries detected, but no GPUs found on this system; Switching to CPU only." + KeOps_Warning(self.cuda_message) + + def _cuda_libraries_available(self): + """ + Check if both cuda and nvrtc libraries are available. + Returns: + True if both cuda and nvrtc are loadable, False otherwise. + This is also where we handle one single warning if needed. + """ + + return "dll_nvrtc" in detection and "dll_cuda" in detection + + def get_cuda_version(self, out_type="single_value"): + + if not self._use_cuda: + self.cuda_version = None + return None + try: + + libcudart = ctypes.CDLL(detection["dll_cudart"]) + cuda_version = ctypes.c_int() + libcudart.cudaRuntimeGetVersion(ctypes.byref(cuda_version)) + cuda_version_value = int(cuda_version.value) + + if out_type == "single_value": + self.cuda_version = cuda_version_value + return cuda_version_value + + major = cuda_version_value // 1000 + minor = (cuda_version_value % 1000) // 10 + + if out_type == "major,minor": + return major, minor + elif out_type == "string": + return f"{major}.{minor}" + except Exception as e: + KeOps_Warning(f"Could not determine CUDA version: {e}") + self.cuda_version = None + return None + + def get_gpu_props(self): + """ + Getting GPU properties and related attributes. + """ + if not self._use_cuda: + # Already determined that CUDA is unavailable + self.n_gpus = 0 + self.gpu_compile_flags = "" + return (self.n_gpus, self.gpu_compile_flags) + + # Attempt to load the CUDA driver library + libcuda_path = detection["dll_cuda"] + + # We have a handle, let's proceed + libcuda = ctypes.CDLL(libcuda_path) + result = libcuda.cuInit(0) + if result != self.CUDA_SUCCESS: + KeOps_Warning( + "CUDA was detected, but driver API could not be initialized. Switching to CPU only." + ) + self.n_gpus = 0 + self.gpu_compile_flags = "" + self._use_cuda = False + return (self.n_gpus, self.gpu_compile_flags) + + # Get GPU count + nGpus = ctypes.c_int() + result = libcuda.cuDeviceGetCount(ctypes.byref(nGpus)) + if result != self.CUDA_SUCCESS: + KeOps_Warning( + "CUDA was detected and driver API was initialized, but no working GPU found. " + "Switching to CPU only." + ) + self.n_gpus = 0 + self.gpu_compile_flags = "" + self._use_cuda = False + return (self.n_gpus, self.gpu_compile_flags) + + self.n_gpus = nGpus.value + if self.n_gpus == 0: + self.gpu_compile_flags = "" + return (self.n_gpus, self.gpu_compile_flags) + + # Query each GPU for properties + MaxThreadsPerBlock = [0] * self.n_gpus + SharedMemPerBlock = [0] * self.n_gpus + + def safe_call(dev_idx, result_code): + if result_code != self.CUDA_SUCCESS: + KeOps_Warning( + f"Error detecting properties for GPU device {dev_idx}. " + "Switching to CPU only." 
+                )
+                return False
+            return True
+
+        for d in range(self.n_gpus):
+            device = ctypes.c_int()
+            if not safe_call(d, libcuda.cuDeviceGet(ctypes.byref(device), d)):
+                self.n_gpus = 0
+                self.gpu_compile_flags = ""
+                self._use_cuda = False
+                return (self.n_gpus, self.gpu_compile_flags)
+
+            output = ctypes.c_int()
+            if not safe_call(
+                d,
+                libcuda.cuDeviceGetAttribute(
+                    byref(output),
+                    self.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
+                    device,
+                ),
+            ):
+                self.n_gpus = 0
+                self.gpu_compile_flags = ""
+                self._use_cuda = False
+                return (self.n_gpus, self.gpu_compile_flags)
+            MaxThreadsPerBlock[d] = output.value
+
+            if not safe_call(
+                d,
+                libcuda.cuDeviceGetAttribute(
+                    byref(output),
+                    self.CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK,
+                    device,
+                ),
+            ):
+                self.n_gpus = 0
+                self.gpu_compile_flags = ""
+                self._use_cuda = False
+                return (self.n_gpus, self.gpu_compile_flags)
+            SharedMemPerBlock[d] = output.value
+
+        # Build compile flags string #TODO auto detection for macros is here
+        self.gpu_compile_flags = f"-DMAXIDGPU={self.n_gpus - 1} "
+        for d in range(self.n_gpus):
+            self.gpu_compile_flags += (
+                f"-DMAXTHREADSPERBLOCK{d}={MaxThreadsPerBlock[d]} "
+            )
+            self.gpu_compile_flags += f"-DSHAREDMEMPERBLOCK{d}={SharedMemPerBlock[d]} "
+
+        return self.n_gpus, self.gpu_compile_flags
diff --git a/keopscore/keopscore/include/CudaSizes_win.h b/keopscore/keopscore/include/CudaSizes_win.h
new file mode 100644
index 000000000..09862a030
--- /dev/null
+++ b/keopscore/keopscore/include/CudaSizes_win.h
@@ -0,0 +1,96 @@
+#pragma once
+
+/////////////////////////////////////////////
+//              GPU Options                //
+/////////////////////////////////////////////
+
+
+
+// fix some Gpu properties
+// CUDA_BLOCK_SIZE gives an upper bound on the size of Cuda blocks
+// The actual block size may be lower due to memory limitations, depending on the formula used
+#ifndef CUDA_BLOCK_SIZE
+#define CUDA_BLOCK_SIZE 192
+#endif
+// Here we define the maximum number of threads per block and the shared memory per block
+// These values can depend on the Gpu, although in fact the values 1024 and 49152 respectively
+// are the right ones for almost all cards.
+// So these values should be fine, but you can check them with the GetGpuProps.cu program
+// Here we assume that: either the user has defined MAXIDGPU (=number of Gpu devices minus one)
+// and corresponding specific values MAXTHREADSPERBLOCK0, SHAREDMEMPERBLOCK0, MAXTHREADSPERBLOCK1, SHAREDMEMPERBLOCK1, ...
+// for each device, or MAXIDGPU is not defined, and we will use global MAXTHREADSPERBLOCK and SHAREDMEMPERBLOCK
+#ifndef MAXIDGPU
+// we give default values
+#ifndef MAXTHREADSPERBLOCK
+#define MAXTHREADSPERBLOCK 1024
+#endif
+#ifndef SHAREDMEMPERBLOCK
+#define SHAREDMEMPERBLOCK 49152
+#endif
+#endif
+
+// global variables maxThreadsPerBlock and sharedMemPerBlock may depend on the device, so we will set them at each call using
+// predefined MAXTHREADSPERBLOCK0, SHAREDMEMPERBLOCK0, MAXTHREADSPERBLOCK1, SHAREDMEMPERBLOCK1, etc.
+// through the function SetGpuProps
+int maxThreadsPerBlock, sharedMemPerBlock;
+
+#define SET_GPU_PROPS_MACRO(n) \
+    if(device == n) { \
+        maxThreadsPerBlock = MAXTHREADSPERBLOCK ## n; \
+        sharedMemPerBlock = SHAREDMEMPERBLOCK ## n; \
+        return; \
+    }
+
+// I have not managed to use a "recursive macro" hack, it was not compiling on all systems.
+// This assumes the number of Gpus is <= 12 ; feel free to add more lines if needed !
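+// For instance, SET_GPU_PROPS_MACRO(0) expands (via the ## token-pasting operator) to:
+//   if(device == 0) { maxThreadsPerBlock = MAXTHREADSPERBLOCK0; sharedMemPerBlock = SHAREDMEMPERBLOCK0; return; }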
+void SetGpuProps(int device) { + +#if defined(MAXTHREADSPERBLOCK) && defined(SHAREDMEMPERBLOCK) + // global values are defined + maxThreadsPerBlock = MAXTHREADSPERBLOCK; + sharedMemPerBlock = SHAREDMEMPERBLOCK; + return; +#else +#if MAXIDGPU >= 0 + SET_GPU_PROPS_MACRO(0) +#endif +#if MAXIDGPU >= 1 + SET_GPU_PROPS_MACRO(1) +#endif +#if MAXIDGPU >= 2 + SET_GPU_PROPS_MACRO(2) +#endif +#if MAXIDGPU >= 3 + SET_GPU_PROPS_MACRO(3) +#endif +#if MAXIDGPU >= 4 + SET_GPU_PROPS_MACRO(4) +#endif +#if MAXIDGPU >= 5 + SET_GPU_PROPS_MACRO(5) +#endif +#if MAXIDGPU >= 6 + SET_GPU_PROPS_MACRO(6) +#endif +#if MAXIDGPU >= 7 + SET_GPU_PROPS_MACRO(7) +#endif +#if MAXIDGPU >= 8 + SET_GPU_PROPS_MACRO(8) +#endif +#if MAXIDGPU >= 9 + SET_GPU_PROPS_MACRO(9) +#endif +#if MAXIDGPU >= 10 + SET_GPU_PROPS_MACRO(10) +#endif +#if MAXIDGPU >= 11 + SET_GPU_PROPS_MACRO(11) +#endif + fprintf( stderr, "invalid Gpu device number. If the number of available Gpus is > 12, add required lines at the end of function SetGpuProps and recompile.\n"); + throw std::runtime_error("[KeOps] Cuda error."); +#endif + +} + + diff --git a/keopscore/keopscore/include/Ranges_win.h b/keopscore/keopscore/include/Ranges_win.h new file mode 100644 index 000000000..19270499e --- /dev/null +++ b/keopscore/keopscore/include/Ranges_win.h @@ -0,0 +1,99 @@ +#pragma once + +#include "Sizes_win.h" + +template< typename TYPE > +class Ranges { +public: + int tagRanges, nranges_x, nranges_y, nredranges_x, nredranges_y; + + std::vector< int > ranges_i, slices_i, redranges_j; + std::vector< int * > _castedranges; + int **castedranges; + + Ranges(Sizes< TYPE > sizes, int **ranges) { + + _castedranges.resize(6); + + // Sparsity: should we handle ranges? ====================================== + if (sizes.nbatchdims == 0) { // Standard M-by-N computation + if (ranges[6][0] == -1) { + tagRanges = 0; + + nranges_x = 0; + nranges_y = 0; + + nredranges_x = 0; + nredranges_y = 0; + + } else { + tagRanges = 1; + nranges_x = ranges[6][0]; + nranges_y = ranges[6][3]; + nredranges_x = ranges[6][5]; + nredranges_y = ranges[6][2]; + + // get the pointers to data to avoid a copy + for (int i = 0; i < 6; i++) { + _castedranges[i] = ranges[i]; + } + } + + } else if (ranges[6][0] == -1) { + // Batch processing: we'll have to generate a custom, block-diagonal sparsity pattern + tagRanges = 1; // Batch processing is emulated through the block-sparse mode + + // Create new "castedranges" from scratch ------------------------------ + // With pythonic notations, we'll have: + // castedranges = (ranges_i, slices_i, redranges_j, ranges_j, slices_j, redranges_i) + // with: + // - ranges_i = redranges_i = [ [0,M], [M,2M], ..., [(nbatches-1)M, nbatches*M] ] + // - slices_i = slices_j = [ 1, 2, ..., nbatches-1, nbatches ] + // - redranges_j = ranges_j = [ [0,N], [N,2N], ..., [(nbatches-1)N, nbatches*N] ] + + + //int ranges_i[2 * sizes.nbatches]; // ranges_i + ranges_i.resize(2 * sizes.nbatches, 0); + + //int slices_i[sizes.nbatches]; // slices_i + slices_i.resize(sizes.nbatches, 0); + + //int redranges_j[2 * sizes.nbatches]; // redranges_j + redranges_j.resize(2 * sizes.nbatches, 0); + + for (int b = 0; b < sizes.nbatches; b++) { + ranges_i[2 * b] = b * sizes.M; + ranges_i[2 * b + 1] = (b + 1) * sizes.M; + slices_i[b] = (b + 1); + redranges_j[2 * b] = b * sizes.N; + redranges_j[2 * b + 1] = (b + 1) * sizes.N; + } + + _castedranges[0] = &ranges_i[0]; + _castedranges[1] = &slices_i[0]; + _castedranges[2] = &redranges_j[0]; + _castedranges[3] = &redranges_j[0]; // ranges_j + _castedranges[4] = 
&slices_i[0]; // slices_j
+            _castedranges[5] = &ranges_i[0]; // redranges_i
+
+            nranges_x = sizes.nbatches;
+            nredranges_x = sizes.nbatches;
+            nranges_y = sizes.nbatches;
+            nredranges_y = sizes.nbatches;
+        }
+#if do_checks
+        else {
+            throw std::runtime_error(
+                "[KeOps] The 'ranges' argument (block-sparse mode) is not supported with batch processing, "
+                "but we detected " + std::to_string(sizes.nbatchdims) + " > 0 batch dimensions."
+            );
+        }
+#endif
+
+        castedranges = &_castedranges[0];
+
+    };
+
+};
diff --git a/keopscore/keopscore/include/Sizes_win.h b/keopscore/keopscore/include/Sizes_win.h
new file mode 100644
index 000000000..c78f30e7c
--- /dev/null
+++ b/keopscore/keopscore/include/Sizes_win.h
@@ -0,0 +1,402 @@
+#pragma once
+
+#include <vector>
+#include <algorithm>
+#include <numeric>
+#include <functional>
+
+
+#define MIN(a, b) (((a)<(b))?(a):(b))
+#define MAX(a, b) (((a)<(b))?(b):(a))
+#define MAX3(a, b, c) (MAX(MAX(a,b),c))
+
+#define do_checks 0
+#if do_checks
+void error(std::string message) {
+    throw std::runtime_error(message);
+}
+#endif
+
+
+#if C_CONTIGUOUS
+
+int get_val_batch(std::vector< int > _shape, int nbatch, int b) {
+    return _shape[b];
+}
+
+#else
+
+int get_val_batch(std::vector< int > _shape, int nbatch, int b) {
+    return _shape[nbatch - b];
+}
+
+#endif
+
+template< typename TYPE >
+class Sizes {
+public:
+
+    // attributes
+    int nargs;
+    int nx, ny;
+    int M, N;
+    int nbatchdims;
+    int nbatches;
+
+    std::vector< int > _shapes;
+    int *shapes;
+    std::vector< int > _shape_out;
+    int *shape_out;
+
+    int tagIJ;
+    int use_half;
+    std::vector< int > indsI;
+    std::vector< int > indsJ;
+    std::vector< int > indsP;
+    int pos_first_argI;
+    int pos_first_argJ;
+    int dimout;
+    int nminargs;
+    int nvarsI;
+    int nvarsJ;
+    int nvarsP;
+    std::vector< int > dimsX;
+    std::vector< int > dimsY;
+    std::vector< int > dimsP;
+
+    // constructors
+    Sizes(int _nargs, TYPE **args, const std::vector< std::vector< int > > &argshapes, int _nx, int _ny,
+          int tagIJ_, int use_half_, int dimout_,
+          const std::vector< int > &indsI_, std::vector< int > indsJ_, const std::vector< int > &indsP_,
+          const std::vector< int > &dimsX_, std::vector< int > dimsY_, const std::vector< int > &dimsP_) {
+
+        tagIJ = tagIJ_;
+        use_half = use_half_;
+        indsI = indsI_;
+        indsJ = indsJ_;
+        indsP = indsP_;
+        dimout = dimout_;
+
+        nvarsI = indsI.size();
+        nvarsJ = indsJ.size();
+        nvarsP = indsP.size();
+
+        pos_first_argI = (nvarsI > 0) ? *std::min_element(indsI.begin(), indsI.end()) : -1;
+        pos_first_argJ = (nvarsJ > 0) ? *std::min_element(indsJ.begin(), indsJ.end()) : -1;
+
+        int max_i = (nvarsI > 0) ? *std::max_element(indsI.begin(), indsI.end()) : -1;
+        int max_j = (nvarsJ > 0) ? *std::max_element(indsJ.begin(), indsJ.end()) : -1;
+        int max_p = (nvarsP > 0) ? *std::max_element(indsP.begin(), indsP.end()) : -1;
+
+        nminargs = 1 + MAX3(max_i, max_j, max_p);
+        dimsX = dimsX_;
+        dimsY = dimsY_;
+        dimsP = dimsP_;
+        nargs = _nargs;
+        nx = _nx;
+        ny = _ny;
+
+        // fill shapes with "batch dimensions" [A, .., B], the table will look like:
+        //
+        // [ A, .., B, M, N, D_out]  -> output
+        // [ A, .., B, M, 1, D_1  ]  -> "i" variable
+        // [ A, .., B, 1, N, D_2  ]  -> "j" variable
+        // [ A, .., B, 1, 1, D_3  ]  -> "parameter"
+        // [ A, .., 1, M, 1, D_4  ]  -> N.B.: we support broadcasting on the batch dimensions!
+        // [ 1, .., 1, M, 1, D_5  ]  -> (we'll just ask users to fill in the shapes with *explicit* ones)
+        fill_shape(nargs, argshapes);
+
+        check_ranges(argshapes);
+
+        // fill shape_out
+        _shape_out.resize(nbatchdims + 3);
+
+#if C_CONTIGUOUS
+        std::copy(_shapes.begin(), _shapes.begin() + nbatchdims + 3, _shape_out.begin()); // Copy the "batch dimensions"
+        _shape_out.erase(_shape_out.begin() + nbatchdims + (1 - tagIJ));
+
+#else
+        std::reverse_copy(_shapes.begin(), _shapes.begin() + nbatchdims + 3,
+                          _shape_out.begin()); // Copy the "batch dimensions"
+        _shape_out.erase(_shape_out.begin() + 1 + tagIJ);
+
+#endif
+
+        // fill nx and ny
+        M = _shapes[nbatchdims]; // = M
+        N = _shapes[nbatchdims + 1]; // = N
+
+        // Compute the product of all "batch dimensions"
+        nbatches = std::accumulate(_shapes.begin(), _shapes.begin() + nbatchdims, 1, std::multiplies< int >());
+
+        nx = nbatches * M; // = A * ... * B * M
+        ny = nbatches * N; // = A * ... * B * N
+
+        shapes = &_shapes[0];
+        shape_out = &_shape_out[0];
+    }
+
+
+    // methods
+
+    void switch_to_half2_indexing();
+
+private:
+    void fill_shape(const int nargs, const std::vector< std::vector< int > > &argshapes);
+
+    void check_ranges(const std::vector< std::vector< int > > &argshapes);
+
+    int MN_pos, D_pos;
+};
+
+
+template< typename TYPE >
+void Sizes< TYPE >::fill_shape(const int nargs, const std::vector< std::vector< int > > &argshapes) {
+
+    int pos = std::max(pos_first_argI, pos_first_argJ);
+
+    if (pos > -1) {
+        // Are we working in batch mode? Infer the answer from the first arg =============
+        nbatchdims = argshapes[pos].size() - 2; // number of batch dimensions = Number of dims of the first tensor - 2
+
+        if (nbatchdims < 0) {
+#if do_checks
+            error("[KeOps] Wrong number of dimensions for arg at position 0: is "
+                  + std::to_string(argshapes[0].size()) + " but should be at least 2."
+            );
+#endif
+        }
+    } else {
+        nbatchdims = 0;
+    }
+
+#if C_CONTIGUOUS
+    MN_pos = nbatchdims;
+    D_pos = nbatchdims + 1;
+#else
+    D_pos = 0;
+    MN_pos = 1;
+#endif
+
+    // Now, we'll keep track of the output + all arguments' shapes in a large array:
+    _shapes.resize((nargs + 1) * (nbatchdims + 3), 1);
+
+    if (use_half) {
+        if (tagIJ == 0) {
+            _shapes[nbatchdims] = nx % 2 ? nx + 1 : nx;
+            _shapes[nbatchdims + 1] = 2 * ny;
+        } else {
+            _shapes[nbatchdims] = 2 * nx;
+            _shapes[nbatchdims + 1] = ny % 2 ? ny + 1 : ny;
+        }
+    } else {
+        _shapes[nbatchdims] = nx;
+        _shapes[nbatchdims + 1] = ny;
+    }
+
+    _shapes[nbatchdims + 2] = dimout; // Top right corner: dimension of the output
+
+}
+
+template< typename TYPE >
+void Sizes< TYPE >::check_ranges(const std::vector< std::vector< int > > &argshapes) {
+
+    // Check the compatibility of all tensor shapes ==================================
+    if (nminargs > 0) {
+
+        // Checks args in all the positions that correspond to "i" variables:
+        for (int k = 0; k < nvarsI; k++) {
+            int i = indsI[k];
+
+            // Fill in the (i+1)-th line of the "shapes" array ---------------------------
+            int off_i = (i + 1) * (nbatchdims + 3);
+
+            // Check the number of dimensions --------------------------------------------
+            int ndims = argshapes[i].size(); // Number of dims of the i-th tensor
+
+#if do_checks
+            if (ndims != nbatchdims + 2) {
+                error("[KeOps] Wrong number of dimensions for arg at position " + std::to_string(i)
+                      + " (i type): KeOps detected " + std::to_string(nbatchdims)
+                      + " batch dimensions from the first argument 0, and thus expected "
+                      + std::to_string(nbatchdims + 2)
+                      + " dimensions here, but only received "
+                      + std::to_string(ndims)
+                      + ". 
Note that KeOps supports broadcasting on batch dimensions, " + + "but still expects 'dummy' unit dimensions in the input shapes, " + + "for the sake of clarity."); + } +#endif + + + + // First, the batch dimensions: + for (int b = 0; b < nbatchdims; b++) { + _shapes[off_i + b] = get_val_batch(argshapes[i], nbatchdims + 2, b); + + // Check that the current value is compatible with what + // we've encountered so far, as stored in the first line of "shapes" + if (_shapes[off_i + b] != 1) { // This dimension is not "broadcasted" + if (_shapes[b] == 1) { + _shapes[b] = _shapes[off_i + b]; // -> it becomes the new standard + } +#if do_checks + else if (_shapes[b] != _shapes[off_i + b]) { + error("[KeOps] Wrong value of the batch dimension " + + std::to_string(b) + " for argument number " + std::to_string(i) + + " : is " + std::to_string(_shapes[off_i + b]) + + " but was " + std::to_string(_shapes[b]) + + " or 1 in previous arguments."); + } +#endif + } + } + + _shapes[off_i + nbatchdims] = argshapes[i][MN_pos]; // = "M" + _shapes[off_i + nbatchdims + 2] = argshapes[i][D_pos]; // = "D" + + +#if do_checks + // Check the number of "lines": + if (_shapes[nbatchdims] != _shapes[off_i + nbatchdims]) { + error("[KeOps] Wrong value of the 'i' dimension " + + std::to_string(nbatchdims) + "for arg at position " + std::to_string(i) + + " : is " + std::to_string(_shapes[off_i + nbatchdims]) + + " but was " + std::to_string(_shapes[nbatchdims]) + + " in previous 'i' arguments."); + } + + // And the number of "columns": + if (_shapes[off_i + nbatchdims + 2] != static_cast< int >(dimsX[k])) { + error("[KeOps] Wrong value of the 'vector size' dimension " + + std::to_string(nbatchdims + 1) + " for arg at position " + std::to_string(i) + + " : is " + std::to_string(_shapes[off_i + nbatchdims + 2]) + + " but should be " + std::to_string(dimsX[k])); + } +#endif + } + + + // Checks args in all the positions that correspond to "j" variables: + for (int k = 0; k < nvarsJ; k++) { + int i = indsJ[k]; + + // Check the number of dimensions -------------------------------------------- + int ndims = argshapes[i].size(); // Number of dims of the i-th tensor + +#if do_checks + if (ndims != nbatchdims + 2) { + error("[KeOps] Wrong number of dimensions for arg at position " + std::to_string(i) + + " (j type): KeOps detected " + std::to_string(nbatchdims) + + " batch dimensions from the first argument 0, and thus expected " + + std::to_string(nbatchdims + 2) + + " dimensions here, but only received " + + std::to_string(ndims) + + ". 
Note that KeOps supports broadcasting on batch dimensions, " + + "but still expects 'dummy' unit dimensions in the input shapes, " + + "for the sake of clarity."); + } +#endif + + // Fill in the (i+1)-th line of the "shapes" array --------------------------- + int off_i = (i + 1) * (nbatchdims + 3); + + // First, the batch dimensions: + for (int b = 0; b < nbatchdims; b++) { + _shapes[off_i + b] = get_val_batch(argshapes[i], nbatchdims + 2, b); + + // Check that the current value is compatible with what + // we've encountered so far, as stored in the first line of "shapes" + if (_shapes[off_i + b] != 1) { // This dimension is not "broadcasted" + if (_shapes[b] == 1) { + _shapes[b] = _shapes[off_i + b]; // -> it becomes the new standard + } +#if do_checks + else if (_shapes[b] != _shapes[off_i + b]) { + error("[KeOps] Wrong value of the batch dimension " + + std::to_string(b) + " for argument number " + std::to_string(i) + + " : is " + std::to_string(_shapes[off_i + b]) + + " but was " + std::to_string(_shapes[b]) + + " or 1 in previous arguments."); + } +#endif + } + } + + _shapes[off_i + nbatchdims + 1] = argshapes[i][MN_pos]; // = "N" + _shapes[off_i + nbatchdims + 2] = argshapes[i][D_pos]; // = "D" + + +#if do_checks + // Check the number of "lines": + if (_shapes[nbatchdims + 1] != _shapes[off_i + nbatchdims + 1]) { + error("[KeOps] Wrong value of the 'j' dimension " + + std::to_string(nbatchdims) + " for arg at position " + std::to_string(i) + + " : is " + std::to_string(_shapes[off_i + nbatchdims + 1]) + + " but was " + std::to_string(_shapes[nbatchdims + 1]) + + " in previous 'j' arguments."); + } + + // And the number of "columns": + if (_shapes[off_i + nbatchdims + 2] != static_cast< int >(dimsY[k])) { + error("[KeOps] Wrong value of the 'vector size' dimension " + + std::to_string(nbatchdims + 1) + " for arg at position " + std::to_string(i) + + " : is " + std::to_string(_shapes[off_i + nbatchdims + 2]) + + " but should be " + std::to_string(dimsY[k])); + } +#endif + } + + + for (int k = 0; k < nvarsP; k++) { + int i = indsP[k]; + // Fill in the (i+1)-th line of the "shapes" array --------------------------- + int off_i = (i + 1) * (nbatchdims + 3); + // First, the batch dimensions: + for (int b = 0; b < nbatchdims; b++) { + _shapes[off_i + b] = get_val_batch(argshapes[i], nbatchdims + 2, b); + } + _shapes[off_i + nbatchdims + 2] = argshapes[i][nbatchdims]; // = "D" +#if do_checks + int dim_param; + if (use_half) + dim_param = _shapes[off_i + nbatchdims + 2] / 2; + else + dim_param = _shapes[off_i + nbatchdims + 2]; + if (dim_param != static_cast< int >(dimsP[k])) { + error("[KeOps] Wrong value of the 'vector size' dimension " + + std::to_string(nbatchdims) + " for arg at position " + std::to_string(i) + + " : is " + std::to_string(dim_param) + + " but should be " + std::to_string(dimsP[k])); + } +#endif + } + } + +} + +template< typename TYPE > +void Sizes< TYPE >::switch_to_half2_indexing() { + // special case of float16 inputs : because we use half2 type in Cuda codes, we need to divide by two nx, ny, and M, N, or D + // values inside the shapes vector. + nx = nx / 2; + ny = ny / 2; + M = M / 2; + N = N / 2; + _shapes[nbatchdims] = _shapes[nbatchdims] / 2; + _shapes[nbatchdims + 1] = _shapes[nbatchdims + 1] / 2; + for (int i = 0; i < nargs; i++) { + int off_i = (i + 1) * (nbatchdims + 3); + // we don't have anymore the category information... 
+ // the last three dimensions are either of the form (M,1,D), (1,N,D), or (1,1,D) + // where M or N are even in the 2 first cases, or D is even in the third case. + if (_shapes[off_i + nbatchdims] > 1) + _shapes[off_i + nbatchdims] = _shapes[off_i + nbatchdims] / 2; + else if (_shapes[off_i + nbatchdims + 1] > 1) + _shapes[off_i + nbatchdims + 1] = _shapes[off_i + nbatchdims + 1] / 2; + else + _shapes[off_i + nbatchdims + 2] = _shapes[off_i + nbatchdims + 2] / 2; + } +} + diff --git a/keopscore/keopscore/include/ranges_utils_win.h b/keopscore/keopscore/include/ranges_utils_win.h new file mode 100644 index 000000000..372b61c06 --- /dev/null +++ b/keopscore/keopscore/include/ranges_utils_win.h @@ -0,0 +1,82 @@ +#pragma once + + +int broadcast_index(int i, int nbatchdims, int *full_shape, int *shape) { + int M_N = shape[nbatchdims]; + int res = i % M_N, step = M_N, full_step = M_N; + for (int b = nbatchdims; b > 0; b--) { + if (shape[b - 1] != 1) { + res += ((i / full_step) % shape[b - 1]) * step; + } + full_step *= full_shape[b - 1]; + step *= shape[b - 1]; + } + return res; +} + +void vect_broadcast_index(int i, int nbatchdims, int nvars, int *full_shape, + int *reduced_shapes, int *out, int add_offset = 0) { + for (int k = 0; k < nvars; k++) { + out[k] = add_offset + broadcast_index(i, nbatchdims, full_shape, reduced_shapes + (nbatchdims + 1) * k); + } +} + +void fill_shapes(int nbatchdims, int *shapes, int *shapes_i, int *shapes_j, int *shapes_p, + int tagJ, + const std::vector< int > &indsi, + const std::vector< int > &indsj, + const std::vector< int > &indsp) { + + int sizei = indsi.size(); + int sizej = indsj.size(); + int sizep = indsp.size(); + + sizei += 1; + + const int tagIJ = tagJ; // 1 if the reduction is made "over j", 0 if it is made "over i" + + // Separate and store the shapes of the "i" and "j" variables + parameters -------------- + // + // N.B.: If tagIJ == 1, the reduction is made over 'j', which is the default mode. + // However, if tagIJ == 0, the reduction is performed over the 'i' variables: + // since "shape" does not change, we must adapt the adress at which we pick information... + // + // shapes is an array of size (1+nargs)*(nbatchdims+3), which looks like: + // [ A, .., B, M, N, D_out] -> output + // [ A, .., B, M, 1, D_1 ] -> "i" variable + // [ A, .., B, 1, N, D_2 ] -> "j" variable + // [ A, .., B, 1, 1, D_3 ] -> "parameter" + // [ A, .., 1, M, 1, D_4 ] -> N.B.: we support broadcasting on the batch dimensions! 
+    // [ 1, .., 1, M, 1, D_5  ]  -> (we'll just ask users to fill in the shapes with *explicit* ones)
+
+    // First, we fill shapes_i with the "relevant" shapes of the "i" variables,
+    // making it look like, say:
+    // [ A, .., B, M]
+    // [ A, .., 1, M]
+    // [ A, .., A, M]
+    for (int k = 0; k < (sizei - 1); k++) { // k-th line
+        for (int l = 0; l < nbatchdims; l++) { // l-th column
+            shapes_i[k * (nbatchdims + 1) + l] = shapes[(1 + indsi[k]) * (nbatchdims + 3) + l];
+        }
+        shapes_i[k * (nbatchdims + 1) + nbatchdims] =
+                shapes[(1 + indsi[k]) * (nbatchdims + 3) + nbatchdims + 1 - tagIJ];
+    }
+
+    // Then, we do the same for shapes_j, but with "N" instead of "M":
+    for (int k = 0; k < sizej; k++) { // k-th line
+        for (int l = 0; l < nbatchdims; l++) { // l-th column
+            shapes_j[k * (nbatchdims + 1) + l] = shapes[(1 + indsj[k]) * (nbatchdims + 3) + l];
+        }
+        shapes_j[k * (nbatchdims + 1) + nbatchdims] = shapes[(1 + indsj[k]) * (nbatchdims + 3) + nbatchdims +
+                                                             tagIJ];
+    }
+
+    // And finally for the parameters, with "1" instead of "M":
+    for (int k = 0; k < sizep; k++) { // k-th line
+        for (int l = 0; l < nbatchdims; l++) { // l-th column
+            shapes_p[k * (nbatchdims + 1) + l] = shapes[(1 + indsp[k]) * (nbatchdims + 3) + l];
+        }
+        shapes_p[k * (nbatchdims + 1) + nbatchdims] = 1;
+    }
+
+}
\ No newline at end of file
diff --git a/keopscore/keopscore/include/utils_pe.h b/keopscore/keopscore/include/utils_pe.h
index 55c91babc..56ef5e99b 100644
--- a/keopscore/keopscore/include/utils_pe.h
+++ b/keopscore/keopscore/include/utils_pe.h
@@ -1,5 +1,6 @@
 #include <nvrtc.h>
 #include <numeric>
+#include <vector>
 
 #define NVRTC_SAFE_CALL(x) \
   do { \
@@ -68,7 +69,8 @@
 void load_args_FromHost(CUdeviceptr &p_data, TYPE *out, TYPE *&out_d, int nargs,
                         TYPE **arg, TYPE **&arg_d,
                         const std::vector< std::vector< int > > &argshape,
                         signed long int sizeout) {
-  signed long int sizes[nargs];
+
+  std::vector< signed long int > sizes(nargs);
   signed long int totsize = sizeout;
   for (int k = 0; k < nargs; k++) {
     sizes[k] = std::accumulate(argshape[k].begin(), argshape[k].end(), 1,
@@ -83,7 +85,7 @@
   TYPE *dataloc = (TYPE *)(arg_d + nargs);
 
   // host array of pointers to device data
-  TYPE *ph[nargs];
+  std::vector< TYPE * > ph(nargs);
 
   out_d = dataloc;
   dataloc += sizeout;
@@ -95,5 +97,5 @@
   }
 
   // copy array of pointers
-  CUDA_SAFE_CALL(cuMemcpyHtoD((CUdeviceptr)arg_d, ph, nargs * sizeof(TYPE *)));
+  CUDA_SAFE_CALL(cuMemcpyHtoD((CUdeviceptr)arg_d, ph.data(), nargs * sizeof(TYPE *)));
 }
diff --git a/keopscore/keopscore/include/utils_pe_win.h b/keopscore/keopscore/include/utils_pe_win.h
new file mode 100644
index 000000000..6e172e476
--- /dev/null
+++ b/keopscore/keopscore/include/utils_pe_win.h
@@ -0,0 +1,100 @@
+#include <nvrtc.h>
+#include <numeric>
+
+#define NVRTC_SAFE_CALL(x)                                        \
+    do {                                                          \
+        nvrtcResult result = x;                                   \
+        if (result != NVRTC_SUCCESS) {                            \
+            std::cerr << "\nerror: " #x " failed with error "     \
+                      << nvrtcGetErrorString(result) << '\n' << '\n'; \
+            throw std::runtime_error("[KeOps] NVRTC error.");     \
+        }                                                         \
+    } while(0)
+
+#define CUDA_SAFE_CALL_NO_EXCEPTION(x)                            \
+    do {                                                          \
+        CUresult result = x;                                      \
+        if (result != CUDA_SUCCESS) {                             \
+            const char *msg;                                      \
+            cuGetErrorName(result, &msg);                         \
+            std::cerr << "\n[KeOps] error: " #x " failed with error " \
+                      << msg << '\n' << '\n';                     \
+            exit(1);                                              \
+        }                                                         \
+    } while(0)
+
+#define CUDA_SAFE_CALL(x)                                         \
+    do {                                                          \
+        CUresult result = x;                                      \
+        if (result != CUDA_SUCCESS) {                             \
+            const char *msg;                                      \
+            cuGetErrorName(result, &msg);                         \
+            std::cerr << "\n[KeOps] error: " #x " failed with error " \
+                      << msg << '\n' << '\n';                     \
+            throw std::runtime_error("[KeOps] Cuda error.");      \
+        }                                                         \
+    } while(0)
+
+
+char *read_text_file(char const *path) {
+    char *buffer = 0;
+    long length;
+    FILE *f = fopen(path, "rb");
+    if (f) {
+        fseek(f, 0, SEEK_END);
+        length = ftell(f);
+        fseek(f, 0, SEEK_SET);
+        buffer = (char *) malloc((length + 1) * sizeof(char));
+        if (buffer) {
+            int res = fread(buffer, sizeof(char), length, f);
+            (void) res;
+            buffer[length] = '\0';
+        }
+        fclose(f);
+    }
+    return buffer;
+}
+
+
+
+template< typename TYPE >
+void load_args_FromDevice(CUdeviceptr &p_data, TYPE *out, TYPE *&out_d, int nargs, TYPE **arg, TYPE **&arg_d) {
+    CUDA_SAFE_CALL(cuMemAlloc(&p_data, sizeof(TYPE *) * nargs));
+    out_d = out;
+    arg_d = (TYPE **) p_data;
+    // copy array of pointers
+    CUDA_SAFE_CALL(cuMemcpyHtoD((CUdeviceptr) arg_d, arg, nargs * sizeof(TYPE *)));
+}
+
+
+template< typename TYPE >
+void
+load_args_FromHost(CUdeviceptr &p_data, TYPE *out, TYPE *&out_d, int nargs,
+                   TYPE **arg, TYPE **&arg_d,
+                   const std::vector< std::vector< int > > &argshape,
+                   int sizeout) {
+    std::vector< int > sizes(nargs);
+    int totsize = sizeout;
+    for (int k = 0; k < nargs; k++) {
+        sizes[k] = std::accumulate(argshape[k].begin(), argshape[k].end(), 1, std::multiplies< int >());
+        totsize += sizes[k];
+    }
+
+    CUDA_SAFE_CALL(cuMemAlloc(&p_data, sizeof(TYPE *) * nargs + sizeof(TYPE) * totsize));
+
+    arg_d = (TYPE **) p_data;
+    TYPE *dataloc = (TYPE *) (arg_d + nargs);
+
+    // host array of pointers to device data
+    std::vector< TYPE * > ph(nargs);
+
+    out_d = dataloc;
+    dataloc += sizeout;
+    for (int k = 0; k < nargs; k++) {
+        ph[k] = dataloc;
+        CUDA_SAFE_CALL(cuMemcpyHtoD((CUdeviceptr) dataloc, arg[k], sizeof(TYPE) * sizes[k]));
+        dataloc += sizes[k];
+    }
+
+    // copy array of pointers
+    CUDA_SAFE_CALL(cuMemcpyHtoD((CUdeviceptr) arg_d, ph.data(), nargs * sizeof(TYPE *)));
+}
diff --git a/keopscore/keopscore/windows_compilations/__init__.py b/keopscore/keopscore/windows_compilations/__init__.py
new file mode 100644
index 000000000..cc7a91321
--- /dev/null
+++ b/keopscore/keopscore/windows_compilations/__init__.py
@@ -0,0 +1,28 @@
+"""Initialize the windows_compilations package and create dictionaries with found includes/libs/dlls"""
+
+from .compile import compile
+from .compile_nvrtc_jit import compile_nvrtc_jit
+from .compile_pykeops_cpp_module import compile_pykeops_cpp_module
+from .compile_pykeops_nvrtc import compile_pykeops_nvrtc
+from .detection import (
+    dlls,
+    include_dirs,
+    keops_available,
+    lib_dirs,
+    lib_names,
+)
+from .globals import tmp_dir
+from .cuda_detection import cuda_available
+
+__all__ = [
+    "compile",
+    "tmp_dir",
+    "include_dirs",
+    "lib_dirs",
+    "lib_names",
+    "dlls",
+    "compile_pykeops_cpp_module",
+    "compile_nvrtc_jit",
+    "compile_pykeops_nvrtc",
+    "keops_available",
+]
diff --git a/keopscore/keopscore/windows_compilations/compile.py b/keopscore/keopscore/windows_compilations/compile.py
new file mode 100644
index 000000000..d01699e4a
--- /dev/null
+++ b/keopscore/keopscore/windows_compilations/compile.py
@@ -0,0 +1,134 @@
+from __future__ import annotations
+
+import os
+import shutil
+import sysconfig
+import uuid
+from pathlib import Path
+
+from .globals import tmp_dir
+
+_empty_list = []
+
+
+def compile(
+    source_file: os.PathLike,
+    project_name: str | None = None,
+    includes: list[os.PathLike] | None = _empty_list,
+    link_dirs: list[os.PathLike] | None = _empty_list,
+    links: list[str] | None = _empty_list,
+    macros: list[str] | None = _empty_list,
+    suffix: str = ".dll",
+    output_dir=".",
+    print_cmakelists=False,
+    show_cmake_commands_output=False,
+    clean_tmp_build_dir=True,
+):
+
+    output_dir = Path(output_dir)
+
+    if project_name is None:
+        project_name = "".join(Path(source_file).name.split(".")[:-1])
+
+    includes_str = ""
+    for include in includes:
+        includes_str += f'include_directories("{include!s}")\n'
+    includes_str = includes_str.replace("\\", "/")
+
+    link_dirs_str = ""
+    for link in link_dirs:
+        link_dirs_str += f'link_directories("{link!s}")\n'
+    link_dirs_str = link_dirs_str.replace("\\", "/")
+
+    macros_str = ""
+    for macro in macros:
+        macros_str += f"add_definitions({macro!s})\n"
+
+    if len(links) == 0:
+        links = ""
+    else:
+        inline_links = " ".join(links)
+        links = f"target_link_libraries(${{PROJECT_NAME}} {inline_links})"
+
+    with (Path(__file__).parent / "templates" / "CMakeLists.txt").open() as f:
+        content = f.read()
+
+    includes = includes_str
+    link_dirs = link_dirs_str
+    macros = macros_str
+    source_file = str(Path(source_file).resolve()).replace("\\", "/")
+
+    if "pyd" in suffix:
+        suffix = sysconfig.get_config_var("EXT_SUFFIX")
+
+    # substitute each ***field*** placeholder of the template with the
+    # matching local variable computed above
+    fields = [
+        "source_file",
+        "project_name",
+        "includes",
+        "link_dirs",
+        "links",
+        "macros",
+        "suffix",
+    ]
+
+    for field in fields:
+        content = content.replace(f"***{field}***", locals()[field])
+
+    cwd = Path.cwd()
+
+    # Build in a fresh, uniquely-named tmp dir
+    tmp_build = tmp_dir / str(uuid.uuid4())
+    tmp_build.mkdir(parents=True)
+
+    with (tmp_build / "CMakeLists.txt").open("w") as f:
+        f.write(content)
+
+    if print_cmakelists:
+        print(content)
+
+    os.chdir(tmp_build)
+    Path("build").mkdir()
+    os.chdir("build")
+
+    if not show_cmake_commands_output:
+        with (tmp_build / "log").open("w") as log_file:
+            subprocess.check_call(["cmake", ".."], stdout=log_file)
+            subprocess.check_call(
+                ["cmake", "--build", ".", "--config", "Release"], stdout=log_file
+            )
+    else:
+        subprocess.check_call(["cmake", ".."])
+        subprocess.check_call(["cmake", "--build", ".", "--config", "Release"])
+
+    # Back to previous working directory
+    os.chdir(cwd)
+
+    # Define the source and destination directories
+    Path(output_dir).mkdir(exist_ok=True)
+
+    source_dir = tmp_build / "build" / "Release"
+
+    # Copy the built artifacts with the requested suffix to the output directory
+    for item in os.listdir(source_dir):
+        s = source_dir / item
+        d = output_dir / item
+        if str(s).endswith(suffix):
+            shutil.copy2(s, d)
+
+    if clean_tmp_build_dir:
+        shutil.rmtree(tmp_build)
diff --git a/keopscore/keopscore/windows_compilations/compile_nvrtc_jit.py b/keopscore/keopscore/windows_compilations/compile_nvrtc_jit.py
new file mode 100644
index 000000000..7227bdd59
--- /dev/null
+++ b/keopscore/keopscore/windows_compilations/compile_nvrtc_jit.py
@@ -0,0 +1,38 @@
+from pathlib import Path
+
+from .compile import compile
+from .detection import (
+    include_dirs,
+    lib_dirs,
+    lib_names,
+)
+from .utils import find_package_location
+
+
+def compile_nvrtc_jit(build_folder):
+
+    keops_dir = Path(find_package_location("keopscore")).parent
+    source_file = keops_dir / "binders" / "nvrtc" / "nvrtc_jit_win.cpp"
+
+    macros = [
+        "-DMAXIDGPU=0",
+        "-DMAXTHREADSPERBLOCK0=1024",
+        "-DSHAREDMEMPERBLOCK0=49152",
+        "-DnvrtcGetTARGET=nvrtcGetCUBIN",
+        "-DnvrtcGetTARGETSize=nvrtcGetCUBINSize",
+        '-DARCHTAG="sm"',
+    ]
+
+    compile(
+        source_file=source_file,
+        project_name="nvrtc_jit",
+        macros=macros,
+        includes=[include_dirs[key] for key in ["keops", "cuda"]],
+        link_dirs=[lib_dirs[key] for key in ["cuda"]],
+        links=[lib_names[key] for key in ["cuda", "nvrtc", "cudart"]],
+        suffix=".dll",
+        output_dir=build_folder,
+        print_cmakelists=False,
+        show_cmake_commands_output=False,
+        clean_tmp_build_dir=False,
+    )
diff --git a/keopscore/keopscore/windows_compilations/compile_pykeops_cpp_module.py b/keopscore/keopscore/windows_compilations/compile_pykeops_cpp_module.py
new file mode 100644
index 000000000..47c8f56ba
--- /dev/null
+++ b/keopscore/keopscore/windows_compilations/compile_pykeops_cpp_module.py
@@ -0,0 +1,31 @@
+from .compile import compile
+from .detection import include_dirs, lib_dirs, lib_names
+
+
+def compile_pykeops_cpp_module(source_file, build_folder):
+    """Compile a pykeops cpp module with the built-in Windows compilation config.
+
+    Parameters
+    ----------
+    source_file
+        Location of the source cpp file
+    build_folder
+        The KeOps build folder
+
+    """
+
+    compile(
+        source_file=source_file,
+        includes=[
+            include_dirs["python"],
+            include_dirs["pybind11"],
+            include_dirs["keops"],
+        ],
+        link_dirs=[lib_dirs["python"]],
+        links=[lib_names["python"]],
+        suffix=".pyd",
+        output_dir=build_folder,
+        print_cmakelists=False,
+        show_cmake_commands_output=False,
+        clean_tmp_build_dir=False,
+    )
diff --git a/keopscore/keopscore/windows_compilations/compile_pykeops_nvrtc.py b/keopscore/keopscore/windows_compilations/compile_pykeops_nvrtc.py
new file mode 100644
index 000000000..361f00362
--- /dev/null
+++ b/keopscore/keopscore/windows_compilations/compile_pykeops_nvrtc.py
@@ -0,0 +1,38 @@
+from pathlib import Path
+
+from .compile import compile
+from .detection import (
+    include_dirs,
+    lib_dirs,
+    lib_names,
+)
+from .utils import find_package_location
+
+
+def compile_pykeops_nvrtc(build_folder):
+
+    pykeops_dir = Path(find_package_location("pykeops")).parent
+    source_file = pykeops_dir / "common" / "keops_io" / "pykeops_nvrtc_win.cpp"
+
+    macros = [
+        "-DMAXIDGPU=0",  # TODO should be detected
+        "-DMAXTHREADSPERBLOCK0=1024",  # TODO should be detected
+        "-DSHAREDMEMPERBLOCK0=49152",  # TODO should be detected
+        "-DnvrtcGetTARGET=nvrtcGetCUBIN",  # TODO should be detected
+        "-DnvrtcGetTARGETSize=nvrtcGetCUBINSize",  # TODO should be detected
+        '-DARCHTAG="sm"',  # TODO should be detected
+    ]
+
+    compile(
+        source_file=source_file,
+        project_name="pykeops_nvrtc",
+        macros=macros,
+        includes=[include_dirs[key] for key in ["python", "pybind11", "keops", "cuda"]],
+        link_dirs=[lib_dirs[key] for key in ["python", "cuda"]],
+        links=[lib_names[key] for key in ["cuda", "nvrtc", "cudart", "python"]],
+        suffix=".pyd",
+        output_dir=build_folder,
+        print_cmakelists=False,
+        show_cmake_commands_output=False,
+        clean_tmp_build_dir=False,
+    )
diff --git a/keopscore/keopscore/windows_compilations/cuda_detection.py b/keopscore/keopscore/windows_compilations/cuda_detection.py
new file mode 100644
index 000000000..3f4b8f63d
--- /dev/null
+++ b/keopscore/keopscore/windows_compilations/cuda_detection.py
@@ -0,0 +1,76 @@
+r"""
+CUDA toolkit detection on Windows.
+
+The CUDA_PATH environment variable must be set. It is usually set by the CUDA installer;
+if not, it must point to a valid CUDA installation
+(typically C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\vX.Y).
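+
+Example (illustrative path; the actual value depends on the local install):
+    CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.6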
+
+The detection looks for the following files:
+- cudart*.dll
+- nvrtc-builtins*.dll
+- nvcuda.dll (the CUDA driver library, usually located in the system32 folder)
+- the include directory
+- the lib/x64 directory containing cuda.lib, nvrtc.lib and cudart.lib
+"""
+
+import os
+from ctypes.util import find_library
+from pathlib import Path
+
+cuda_available = "CUDA_PATH" in os.environ
+
+
+def detect_cuda_toolkit():
+
+    output = {}
+
+    if cuda_available:
+
+        # base path of the CUDA installation (contains bin, lib, include, etc.)
+        cuda_path = Path(os.environ["CUDA_PATH"])
+
+        if find_library("nvcuda") is not None:  # NVCUDA is the main CUDA driver library
+            output["dll_cuda"] = find_library("nvcuda")
+
+        #################################################
+        # Detect relevant DLLs: cudart and nvrtc-builtins
+        #################################################
+
+        # Check both bin and bin/x64 directories for relevant DLLs
+        bin_dirs = [Path(cuda_path, "bin"), Path(cuda_path, "bin", "x64")]
+
+        for bin_dir in bin_dirs:
+            if bin_dir.is_dir():
+                for file in bin_dir.iterdir():
+                    if file.name.startswith("cudart") and file.name.endswith(".dll"):
+                        output["dll_cudart"] = str(file)
+                    if file.name.startswith("nvrtc-builtins") and file.name.endswith(
+                        ".dll"
+                    ):
+                        output["dll_nvrtc"] = str(file)
+
+        #################################################
+        # Detect include and lib directories
+        #################################################
+        cuda_include = Path(cuda_path, "include")
+        if cuda_include.is_dir():
+            output["include_dir"] = str(cuda_include)
+
+        cuda_libs = Path(cuda_path, "lib", "x64")
+        if cuda_libs.is_dir():
+            output["lib_dirs"] = str(cuda_libs)
+
+        #################################################
+        # Make sure that the cuda, nvrtc and cudart libs are available
+        #################################################
+        output["lib_names"] = {}
+        for key in ["cuda", "nvrtc", "cudart"]:
+            if (cuda_libs / (key + ".lib")).is_file():
+                output["lib_names"][key] = key
+
+    return output
diff --git a/keopscore/keopscore/windows_compilations/cuda_detection/CMakeLists.txt b/keopscore/keopscore/windows_compilations/cuda_detection/CMakeLists.txt
new file mode 100644
index 000000000..30cca9752
--- /dev/null
+++ b/keopscore/keopscore/windows_compilations/cuda_detection/CMakeLists.txt
@@ -0,0 +1,127 @@
+cmake_minimum_required(VERSION 3.21)
+project(cuda_detection LANGUAGES CXX)
+
+# If you have multiple toolkits, pass -DCUDAToolkit_ROOT="C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.6"
+find_package(CUDAToolkit REQUIRED)
+
+# --- Derive include and lib dirs ------------------------------------------------
+# include_dir: usually one path
+list(GET CUDAToolkit_INCLUDE_DIRS 0 CUDA_DETECT_INCLUDE_DIR)
+
+# lib_dirs: prefer .../lib/x64 if present, else CUDAToolkit_LIBRARY_DIR
+set(CUDA_DETECT_LIB_DIR "${CUDAToolkit_LIBRARY_DIR}")
+if(EXISTS "${CUDAToolkit_LIBRARY_DIR}/x64")
+    set(CUDA_DETECT_LIB_DIR "${CUDAToolkit_LIBRARY_DIR}/x64")
+endif()
+
+# bin dir for DLLs
+set(_BIN "${CUDAToolkit_BIN_DIR}")
+
+# --- Version tag for NVRTC-builtins (12.6 -> 126) ------------------------------
+string(REPLACE "." ";" _ver_list "${CUDAToolkit_VERSION}")
+list(GET _ver_list 0 _maj)
+list(LENGTH _ver_list _len)
+if(_len GREATER 1)
+    list(GET _ver_list 1 _min)
+else()
+    set(_min 0)
+endif()
+math(EXPR _TAG "${_maj} * 10 + ${_min}")
+
+# --- dll_cuda (driver DLL) -----------------------------------------------------
+# Prefer System32 (64-bit); the SysWOW64 fallback only matters on 32-bit setups
+find_file(CUDA_DETECT_DLL_CUDA
+          NAMES nvcuda.dll
+          PATHS "C:/Windows/System32" "C:/Windows/SysWOW64"
+          NO_DEFAULT_PATH)
+
+# --- dll_cudart (runtime DLL) --------------------------------------------------
+# Try versioned names first, then any cudart64*.dll in the toolkit bin
+set(CUDA_DETECT_DLL_CUDART "")
+foreach(_cand
+        "cudart64_${_maj}${_min}.dll"
+        "cudart64.dll")
+    find_file(CUDA_DETECT_DLL_CUDART NAMES "${_cand}" HINTS "${_BIN}" NO_DEFAULT_PATH)
+    if(CUDA_DETECT_DLL_CUDART)
+        break()
+    endif()
+endforeach()
+if(NOT CUDA_DETECT_DLL_CUDART)
+    file(GLOB _cudart_glob LIST_DIRECTORIES OFF "${_BIN}/cudart64*.dll")
+    if(_cudart_glob)
+        list(GET _cudart_glob 0 CUDA_DETECT_DLL_CUDART)
+    endif()
+endif()
+
+# --- dll_nvrtc (we specifically want the *builtins* DLL) ------------------------
+# Search for nvrtc-builtins matching our tag, else first available
+set(CUDA_DETECT_DLL_NVRTC "")
+file(GLOB _builtins_glob LIST_DIRECTORIES OFF "${_BIN}/nvrtc-builtins*.dll")
+if(_builtins_glob)
+    foreach(_dll IN LISTS _builtins_glob)
+        get_filename_component(_name "${_dll}" NAME)
+        if(_name MATCHES "nvrtc-builtins.*${_TAG}")
+            set(CUDA_DETECT_DLL_NVRTC "${_dll}")
+            break()
+        endif()
+    endforeach()
+    if(NOT CUDA_DETECT_DLL_NVRTC)
+        list(GET _builtins_glob 0 CUDA_DETECT_DLL_NVRTC)
+    endif()
+endif()
+
+# --- Sanity hints (optional warnings) -----------------------------------------
+if(NOT CUDA_DETECT_DLL_CUDA)
+    message(WARNING "nvcuda.dll not found in System32/SysWOW64.")
+endif()
+if(NOT CUDA_DETECT_DLL_CUDART)
+    message(WARNING "cudart64*.dll not found in ${_BIN}.")
+endif()
+if(NOT CUDA_DETECT_DLL_NVRTC)
+    message(WARNING "nvrtc-builtins*.dll not found in ${_BIN}.")
+endif()
+
+# --- Produce outputs in the schema expected by detection.py --------------------
+# lib_names are the import-library basenames
+set(_LIB_NAMES_PY "{'cuda': 'cuda', 'cudart': 'cudart', 'nvrtc': 'nvrtc'}")
+
+# Escape backslashes for Python/JSON strings
+function(_escape_for_py in_var out_var)
+    string(REPLACE "\\" "\\\\" _tmp "${${in_var}}")
+    set(${out_var} "${_tmp}" PARENT_SCOPE)
+endfunction()
+
+_escape_for_py(CUDA_DETECT_DLL_CUDA _DLL_CUDA_PY)
+_escape_for_py(CUDA_DETECT_DLL_CUDART _DLL_CUDART_PY)
+_escape_for_py(CUDA_DETECT_DLL_NVRTC _DLL_NVRTC_PY)
+_escape_for_py(CUDA_DETECT_INCLUDE_DIR _INCLUDE_PY)
+_escape_for_py(CUDA_DETECT_LIB_DIR _LIBDIR_PY)
+
+# Python-style dict (exact key names expected by the Python side).
+# N.B.: string(CONCAT ...) builds a single string; a plain set() would create a
+# semicolon-separated list instead.
+string(CONCAT _DICT_PY
+    "{'dll_cuda': '${_DLL_CUDA_PY}', "
+    "'dll_cudart': '${_DLL_CUDART_PY}', "
+    "'dll_nvrtc': '${_DLL_NVRTC_PY}', "
+    "'include_dir': '${_INCLUDE_PY}', "
+    "'lib_dirs': '${_LIBDIR_PY}', "
+    "'lib_names': ${_LIB_NAMES_PY}}"
+)
+
+message(STATUS "CUDA_DETECTION=${_DICT_PY}")
+
+# Also emit a JSON file (cuda_detect.json) in the build dir for programmatic use.
+# Note: same values as above; JSON requires quoted keys and escaped paths.
+set(_JSON + "{\n" + " \"dll_cuda\": \"${_DLL_CUDA_PY}\",\n" + " \"dll_cudart\": \"${_DLL_CUDART_PY}\",\n" + " \"dll_nvrtc\": \"${_DLL_NVRTC_PY}\",\n" + " \"include_dir\": \"${_INCLUDE_PY}\",\n" + " \"lib_dirs\": \"${_LIBDIR_PY}\",\n" + " \"lib_names\": {\"cuda\": \"cuda\", \"cudart\": \"cudart\", \"nvrtc\": \"nvrtc\"}\n" + "}\n") +file(WRITE "${CMAKE_BINARY_DIR}/cuda_detect.json" "${_JSON}") + +# Dummy target so configure+build does something +add_custom_target(show_detect ALL + COMMENT "CUDA detection complete. See message above or cuda_detect.json.") diff --git a/keopscore/keopscore/windows_compilations/detection.py b/keopscore/keopscore/windows_compilations/detection.py new file mode 100644 index 000000000..059d3c9fe --- /dev/null +++ b/keopscore/keopscore/windows_compilations/detection.py @@ -0,0 +1,62 @@ +import sys +import sysconfig +from pathlib import Path + +import pybind11 + +from .cuda_detection import detect_cuda_toolkit, cuda_available +from .utils import find_package_location + +include_dirs = {} +lib_dirs = {} +lib_names = {} +dlls = {} + + +try: + location_keops_init = find_package_location("keopscore") + include_dirs["keops"] = str(Path(location_keops_init).parent) + keops_available = True +except ImportError: + keops_available = False + + +include_dirs["pybind11"] = pybind11.get_include() + + +# Python +include_dirs["python"] = sysconfig.get_path("include") + +if sys.platform == "win32": + # On Windows, get the path to the Python DLL + python_libs = Path(sysconfig.get_path("include")).parent / "libs" + + # Get the path to the standard library (Lib) + if python_libs.is_dir(): + lib_dirs["python"] = str(python_libs) + + version = str(sys.version_info.major) + str(sys.version_info.minor) + if (Path(lib_dirs["python"]) / ("python" + version + ".lib")).is_file(): + lib_names["python"] = "python" + version +else: + # On Unix-like systems, use sysconfig to get the library directory and name + lib_dirs["python"] = Path(sysconfig.get_config_var("LIBDIR")) + lib_names["python"] = sysconfig.get_config_var("LDLIBRARY") + + +# Cuda +if cuda_available: + cuda_config = detect_cuda_toolkit() + for key in ["cuda", "nvrtc", "cudart"]: + + if f"dll_{key}" in cuda_config: + dlls[key] = cuda_config[f"dll_{key}"] + + if key in cuda_config["lib_names"]: + lib_names[key] = cuda_config["lib_names"][key] + + if "include_dir" in cuda_config: + include_dirs["cuda"] = cuda_config["include_dir"] + + if "lib_dirs" in cuda_config: + lib_dirs["cuda"] = cuda_config["lib_dirs"] diff --git a/keopscore/keopscore/windows_compilations/globals.py b/keopscore/keopscore/windows_compilations/globals.py new file mode 100644 index 000000000..9eb9504c8 --- /dev/null +++ b/keopscore/keopscore/windows_compilations/globals.py @@ -0,0 +1,5 @@ +from pathlib import Path + +tmp_dir = Path.home() / "tmp" / "cppcompile_windows" +(Path.home() / "tmp").mkdir(exist_ok=True) +tmp_dir.mkdir(exist_ok=True) diff --git a/keopscore/keopscore/windows_compilations/templates/CMakeLists.txt b/keopscore/keopscore/windows_compilations/templates/CMakeLists.txt new file mode 100644 index 000000000..458a6b5af --- /dev/null +++ b/keopscore/keopscore/windows_compilations/templates/CMakeLists.txt @@ -0,0 +1,39 @@ +cmake_minimum_required(VERSION 3.10) +project(***project_name***) + +# Set C++ standard +set(CMAKE_CXX_STANDARD 11) + +# Set compiler options +if(MSVC) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /LD /Ox /GL /openmp") +else() + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -O3 -fopenmp") +endif() + +# Preprocessor macros +***macros*** + 
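+# N.B.: every ***field*** marker in this template is replaced verbatim by
+# windows_compilations/compile.py before CMake is invoked.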
+
+# Include directories
+***includes***
+
+# Link directories
+***link_dirs***
+
+# Source file
+set(SOURCE_FILE "***source_file***")
+
+# Add shared library
+add_library(${PROJECT_NAME} SHARED ${SOURCE_FILE})
+
+# Link
+***links***
+
+# Set the output path for the shared library and ensure the right extension
+set_target_properties(${PROJECT_NAME} PROPERTIES
+    LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
+    OUTPUT_NAME "***project_name***"
+    SUFFIX "***suffix***" # Ensure the correct file extension for Python modules
+)
+
+# Silence MSVC warnings (e.g. warning 8029, triggered because we build in a tmp dir).
+# N.B.: "MSVC_WARNING_LEVEL"/"MSVC_WARNING_DISABLE" are not built-in CMake target
+# properties, so we pass the compiler flag directly instead.
+if(MSVC)
+    target_compile_options(${PROJECT_NAME} PRIVATE /W0)
+endif()
diff --git a/keopscore/keopscore/windows_compilations/utils.py b/keopscore/keopscore/windows_compilations/utils.py
new file mode 100644
index 000000000..6a005d08e
--- /dev/null
+++ b/keopscore/keopscore/windows_compilations/utils.py
@@ -0,0 +1,31 @@
+import importlib.util
+
+
+def find_package_location(package_name: str) -> str:
+    """Find the __init__ file of a given package.
+
+    This function does not import the package; it was written to avoid circular
+    imports with KeOps.
+
+    Parameters
+    ----------
+    package_name
+        The name of the package
+
+    Returns
+    -------
+    str
+        The path to the package's __init__ file
+
+    Raises
+    ------
+    ImportError
+        If the package cannot be found
+
+    """
+    spec = importlib.util.find_spec(package_name)
+    if spec is not None and spec.origin:
+        return spec.origin
+    else:
+        message = f"Package '{package_name}' not found."
+        raise ImportError(message)
diff --git a/keopscore/setup.py b/keopscore/setup.py
index a0854b4a2..77c9eaaa8 100644
--- a/keopscore/setup.py
+++ b/keopscore/setup.py
@@ -11,6 +11,16 @@
 with open(os.path.join(here, "keopscore", "keops_version"), encoding="utf-8") as v:
     current_version = v.read().rstrip()
 
+# TODO: fix this (symlink issues on Windows? -> move to pyproject.toml?)
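+# (The keops_version file is a symlink in the repo; on Windows it may not be
+# checked out as a real file, so we read the top-level copy and write its
+# content into the package explicitly.)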
+if os.name == "nt": + with open(os.path.join(here, "..", "keops_version"), encoding="utf-8") as v: + current_version = v.read().rstrip() + # copy the content to keopscore/keops_version + with open( + os.path.join(here, "keopscore", "keops_version"), "w", encoding="utf-8" + ) as v: + v.write(current_version) + # Get the long description from the README file with open(path.join(here, "keopscore", "readme.md"), encoding="utf-8") as f: long_description = f.read() @@ -59,6 +69,7 @@ "keopscore.mapreduce.cpu", "keopscore.mapreduce.gpu", "keopscore.utils", + "keopscore.windows_compilations", ], package_data={ "keopscore": [ @@ -73,8 +84,16 @@ "include/Ranges.h", "include/Sizes.h", "include/utils_pe.h", + "binders/nvrtc/keops_nvrtc_win.cpp", + "binders/nvrtc/nvrtc_jit_win.cpp", + "include/CudaSizes_win.h", + "include/ranges_utils_win.h", + "include/Ranges_win.h", + "include/Sizes_win.h", + "include/utils_pe_win.h", + "windows_compilations/templates/CMakeLists.txt", ], }, - install_requires=[], + install_requires=["cmake"], extras_require={}, ) diff --git a/pykeops/pykeops/common/keops_io/LoadKeOps_cpp.py b/pykeops/pykeops/common/keops_io/LoadKeOps_cpp.py index 7b632f3c9..961568023 100644 --- a/pykeops/pykeops/common/keops_io/LoadKeOps_cpp.py +++ b/pykeops/pykeops/common/keops_io/LoadKeOps_cpp.py @@ -11,6 +11,8 @@ from keopscore.utils.misc_utils import KeOps_OS_Run from pykeops.config import pykeops_cpp_name, python_includes +# TODO limit code duplication for get_pybind11_code() + class LoadKeOps_cpp_class(LoadKeOps): def __init__(self, *args, fast_init=False): @@ -33,7 +35,16 @@ def init_phase1(self): flush=True, end="", ) - KeOps_OS_Run(compile_command) + + if os.name == "nt": + from keopscore.windows_compilations import compile_pykeops_cpp_module + + compile_pykeops_cpp_module( + source_file=srcname, + build_folder=get_build_folder(), + ) + else: + KeOps_OS_Run(compile_command) pyKeOps_Message("OK", use_tag=False, flush=True) def init_phase2(self): @@ -70,7 +81,8 @@ def call_keops(self, nx, ny): ) def get_pybind11_code(self): - return f""" + if os.name != "nt": + return f""" #include "{self.params.source_name}" #include @@ -152,7 +164,7 @@ def get_pybind11_code(self): }} - return launch_keops_cpu_{self.params.tag}< TYPE >(dimY, + return launch_keops_cpu_{self.params.tag} < TYPE >(dimY, nx, ny, tagI, @@ -181,6 +193,120 @@ def get_pybind11_code(self): }} """ + else: + return f""" +#include "{self.params.source_name}" + +#include +namespace py = pybind11; + +template < typename TYPE > +int launch_pykeops_{self.params.tag}_cpu(int dimY, int nx, int ny, + int tagI, int tagZero, int use_half, + int dimred, + int use_chunk_mode, + py::tuple py_indsi, py::tuple py_indsj, py::tuple py_indsp, + int dimout, + py::tuple py_dimsx, py::tuple py_dimsy, py::tuple py_dimsp, + py::tuple py_ranges, + py::tuple py_shapeout, + long out_void, + py::tuple py_arg, + py::tuple py_argshape){{ + + /*------------------------------------*/ + /* Cast input args */ + /*------------------------------------*/ + + std::vector< int > indsi_v(py_indsi.size()); + for (auto i = 0; i < py_indsi.size(); i++) + indsi_v[i] = py::cast< int >(py_indsi[i]); + + + std::vector< int > indsj_v(py_indsj.size()); + for (auto i = 0; i < py_indsj.size(); i++) + indsj_v[i] = py::cast< int >(py_indsj[i]); + + + std::vector< int > indsp_v(py_indsp.size()); + for (auto i = 0; i < py_indsp.size(); i++) + indsp_v[i] = py::cast< int >(py_indsp[i]); + + + std::vector< int > dimsx_v(py_dimsx.size()); + for (auto i = 0; i < py_dimsx.size(); i++) + 
dimsx_v[i] = py::cast< int >(py_dimsx[i]); + + + std::vector< int > dimsy_v(py_dimsy.size()); + for (auto i = 0; i < py_dimsy.size(); i++) + dimsy_v[i] = py::cast< int >(py_dimsy[i]); + + + std::vector< int > dimsp_v(py_dimsp.size()); + for (auto i = 0; i < py_dimsp.size(); i++) + dimsp_v[i] = py::cast< int >(py_dimsp[i]); + + + // Cast the ranges arrays + std::vector< int* > ranges_v(py_ranges.size()); + for (int i = 0; i < py_ranges.size(); i++) + ranges_v[i] = (int*) py::cast< long >(py_ranges[i]); + int **ranges = (int**) ranges_v.data(); + + std::vector< int > shapeout_v(py_shapeout.size()); + for (auto i = 0; i < py_shapeout.size(); i++) + shapeout_v[i] = py::cast< int >(py_shapeout[i]); + + TYPE *out = (TYPE*) out_void; + // std::cout << "out_ptr : " << (long) out << std::endl; + + std::vector< TYPE* > arg_v(py_arg.size()); + for (int i = 0; i < py_arg.size(); i++) + arg_v[i] = (TYPE*) py::cast< long >(py_arg[i]); + TYPE **arg = (TYPE**) arg_v.data(); + + std::vector< std::vector< int > > argshape_v(py_argshape.size()); + for (auto i = 0; i < py_argshape.size(); i++){{ + py::tuple tmp = py_argshape[i]; + std::vector< int > tmp_v(tmp.size()); + for (auto j =0; j < tmp.size(); j++) + tmp_v[j] = py::cast< int >(tmp[j]); + argshape_v[i] = tmp_v; + }} + + + return launch_keops_cpu_{self.params.tag}< TYPE >(dimY, + nx, + ny, + tagI, + tagZero, + use_half, + dimred, + use_chunk_mode, + indsi_v, + indsj_v, + indsp_v, + dimout, + dimsx_v, + dimsy_v, + dimsp_v, + ranges, + shapeout_v, + out, + arg, + argshape_v); + +}} + +PYBIND11_MODULE(pykeops_cpp_{self.params.tag}, m) {{ + m.doc() = "pyKeOps: KeOps for pytorch through pybind11 (pytorch flavour)."; + m.def("launch_pykeops_cpu", &launch_pykeops_{self.params.tag}_cpu < {cpp_dtype[self.params.dtype]} >, "Entry point to keops."); +}} + """.replace( + "long", "int64_t" + ) + LoadKeOps_cpp = Cache_partial( LoadKeOps_cpp_class, use_cache_file=True, save_folder=get_build_folder() diff --git a/pykeops/pykeops/common/keops_io/LoadKeOps_nvrtc.py b/pykeops/pykeops/common/keops_io/LoadKeOps_nvrtc.py index 95ef25489..651d8d9b7 100644 --- a/pykeops/pykeops/common/keops_io/LoadKeOps_nvrtc.py +++ b/pykeops/pykeops/common/keops_io/LoadKeOps_nvrtc.py @@ -66,7 +66,7 @@ def call_keops(self, nx, ny): self.params.dimsp, self.ranges_ptr_new, self.outshape, - self.out_ptr, + self.out_ptr if os.name != "nt" else (self.out_ptr,), self.args_ptr_new, self.argshapes_new, ) @@ -85,7 +85,13 @@ def compile_jit_binary(): dllname=pykeops.config.pykeops_nvrtc_name(type="target"), ) pyKeOps_Message("Compiling nvrtc binder for python ... 
", flush=True, end="") - KeOps_OS_Run(compile_command) + if os.name == "nt": + from keopscore.windows_compilations import compile_pykeops_nvrtc + + compile_pykeops_nvrtc(build_folder=get_build_folder()) + + else: + KeOps_OS_Run(compile_command) pyKeOps_Message("OK", use_tag=False, flush=True) diff --git a/pykeops/pykeops/common/keops_io/pykeops_nvrtc_win.cpp b/pykeops/pykeops/common/keops_io/pykeops_nvrtc_win.cpp new file mode 100644 index 000000000..20d8c3d3b --- /dev/null +++ b/pykeops/pykeops/common/keops_io/pykeops_nvrtc_win.cpp @@ -0,0 +1,148 @@ +// g++ -std=c++11 -shared -fPIC -O3 -fpermissive -lcuda -lnvrtc -L/usr/lib -L/opt/cuda/lib64 -L/opt/cuda/targets/x86_64-linux/lib/ -I/opt/cuda/targets/x86_64-linux/include/ -I /home/bcharlier/projets/keops/keops/keopscore/ -I/usr/include/python3.10/ -DnvrtcGetTARGET=nvrtcGetCUBIN -DnvrtcGetTARGETSize=nvrtcGetCUBINSize -DARCHTAG=\"sm\" keops_io.cpp -o keops_io.cpython-310-x86_64-linux-gnu.so + + +#include +#include + +namespace py = pybind11; + +template< typename TYPE > +class KeOps_module_python : public KeOps_module< TYPE > { +public: + + using KeOps_module< TYPE >::KeOps_module; + + + int operator()(int tagHostDevice, int dimY, int nx, int ny, + int tagI, int tagZero, int use_half, + int tag1D2D, int dimred, + int cuda_block_size, int use_chunk_mode, + py::tuple py_indsi, py::tuple py_indsj, py::tuple py_indsp, + int dimout, + py::tuple py_dimsx, py::tuple py_dimsy, py::tuple py_dimsp, + py::tuple py_ranges, + py::tuple py_shapeout, + py::tuple out_void, + py::tuple py_arg, + py::tuple py_argshape + ) { + + /*------------------------------------*/ + /* Cast input args */ + /*------------------------------------*/ + + std::vector< int > indsi_v(py_indsi.size()); + for (auto i = 0; i < py_indsi.size(); i++) + indsi_v[i] = py::cast< int >(py_indsi[i]); + + + std::vector< int > indsj_v(py_indsj.size()); + for (auto i = 0; i < py_indsj.size(); i++) + indsj_v[i] = py::cast< int >(py_indsj[i]); + + + std::vector< int > indsp_v(py_indsp.size()); + for (auto i = 0; i < py_indsp.size(); i++) + indsp_v[i] = py::cast< int >(py_indsp[i]); + + + std::vector< int > dimsx_v(py_dimsx.size()); + for (auto i = 0; i < py_dimsx.size(); i++) + dimsx_v[i] = py::cast< int >(py_dimsx[i]); + + + std::vector< int > dimsy_v(py_dimsy.size()); + for (auto i = 0; i < py_dimsy.size(); i++) + dimsy_v[i] = py::cast< int >(py_dimsy[i]); + + + std::vector< int > dimsp_v(py_dimsp.size()); + for (auto i = 0; i < py_dimsp.size(); i++) + dimsp_v[i] = py::cast< int >(py_dimsp[i]); + + + // Cast the ranges arrays + std::vector< int * > ranges_v(py_ranges.size()); + for (int i = 0; i < py_ranges.size(); i++) + ranges_v[i] = (int *) py::cast< int64_t >(py_ranges[i]); + int **ranges = (int **) ranges_v.data(); + + // for (auto i: ranges_v) + // std::cout << " " << (long) i << " "; + // std::cout << std::endl; + + //for (auto i=0; i<7; i++) + // std::cout << " " << (long) ranges[i] << " "; + //std::cout << std::endl; + + std::vector< int > shapeout_v(py_shapeout.size()); + for (auto i = 0; i < py_shapeout.size(); i++) + shapeout_v[i] = py::cast< int >(py_shapeout[i]); + + TYPE *out = (TYPE *)py::cast< int64_t >(out_void[0]); + // std::cout << "out_ptr : " << (long) out << std::endl; + + std::vector < TYPE * > arg_v(py_arg.size()); + for (int i = 0; i < py_arg.size(); i++) + arg_v[i] = (TYPE *) py::cast< int64_t >(py_arg[i]); + TYPE **arg = (TYPE **) arg_v.data(); + + std::vector > argshape_v(py_argshape.size()); + for (auto i = 0; i < py_argshape.size(); i++) { + py::tuple tmp = 
py_argshape[i]; + std::vector< int > tmp_v(tmp.size()); + for (auto j = 0; j < tmp.size(); j++) + tmp_v[j] = py::cast< int >(tmp[j]); + argshape_v[i] = tmp_v; + } + +// for (auto i : argshape_v) +// for (auto j : i) +// std::cout << j << " " ; + + return KeOps_module< TYPE >::launch_kernel(tagHostDevice, + dimY, + nx, + ny, + tagI, + tagZero, + use_half, + tag1D2D, + dimred, + cuda_block_size, + use_chunk_mode, + indsi_v, + indsj_v, + indsp_v, + dimout, + dimsx_v, + dimsy_v, + dimsp_v, + ranges, + shapeout_v, + out, + arg, + argshape_v); + } + +}; +///////////////////////////////////////////////////////////////////////////////// +// PyBind11 entry point // +///////////////////////////////////////////////////////////////////////////////// + + +PYBIND11_MODULE(pykeops_nvrtc, m) { +m.doc() = "pyKeOps: KeOps for pytorch through pybind11 (pytorch flavour)."; + +py::class_< KeOps_module_python< float > >(m, "KeOps_module_float") +.def(py::init()) +.def("__call__", &KeOps_module_python< float >::operator()); + +py::class_< KeOps_module_python< double > >(m, "KeOps_module_double") +.def(py::init()) +.def("__call__", &KeOps_module_python< double >::operator()); + +py::class_< KeOps_module_python< half2 > >(m, "KeOps_module_half2") +.def(py::init()) +.def("__call__", &KeOps_module_python< half2 >::operator()); +} diff --git a/pykeops/pykeops/common/utils.py b/pykeops/pykeops/common/utils.py index 104485ee9..d4d37fe7d 100644 --- a/pykeops/pykeops/common/utils.py +++ b/pykeops/pykeops/common/utils.py @@ -1,8 +1,3 @@ -import fcntl -import functools -import importlib.util -import os - import pykeops.config c_type = dict(float16="half2", float32="float", float64="double") diff --git a/pykeops/setup.py b/pykeops/setup.py index d8154d91f..9dc2c2fd1 100644 --- a/pykeops/setup.py +++ b/pykeops/setup.py @@ -12,6 +12,16 @@ with open(os.path.join(here, "pykeops", "keops_version"), encoding="utf-8") as v: current_version = v.read().rstrip() +# TODO fix this (issues with symlinks on windows ? -> moving to pyproject.toml ?) +if os.name == "nt": + with open(os.path.join(here, "..", "keops_version"), encoding="utf-8") as v: + current_version = v.read().rstrip() + # copy the content to pykeops/keops_version + with open( + os.path.join(here, "pykeops", "keops_version"), "w", encoding="utf-8" + ) as v: + v.write(current_version) + # Get the long description from the README file with open(path.join(here, "pykeops", "readme.md"), encoding="utf-8") as f: long_description = f.read() @@ -62,6 +72,7 @@ "licence.txt", "keops_version", "common/keops_io/pykeops_nvrtc.cpp", + "common/keops_io/pykeops_nvrtc_win.cpp", ], }, install_requires=["numpy", "pybind11", "keopscore"],
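
Taken together, the Windows code path replaces every KeOps_OS_Run(compile_command) call with a CMake-driven build. A minimal sketch of how the new helper is meant to be invoked (names are taken from the patch above; the source file and output directory are illustrative):

    import os

    if os.name == "nt":
        from keopscore.windows_compilations import compile as cmake_compile
        from keopscore.windows_compilations import include_dirs

        # renders templates/CMakeLists.txt, builds in a fresh tmp dir under
        # ~/tmp/cppcompile_windows, then copies the resulting .dll to output_dir
        cmake_compile(
            source_file="my_kernel.cpp",  # hypothetical source file
            project_name="my_kernel",
            includes=[include_dirs["keops"]],
            suffix=".dll",
            output_dir="./build",  # illustrative output dir
        )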