From a780e01bd897a1cc0a1ef5bccafec85a59187056 Mon Sep 17 00:00:00 2001
From: Liam Marsh <liam.marsh@epfl.ch>
Date: Fri, 14 Nov 2025 15:22:00 +0100
Subject: [PATCH 1/7] feat: speed up python implementation of laplacian kernel

---
 qstack/regression/local_kernels.py | 67 +++++++++++++++++++++++++++++-
 tests/test_kernels.py              | 15 ++++++-
 2 files changed, 80 insertions(+), 2 deletions(-)

diff --git a/qstack/regression/local_kernels.py b/qstack/regression/local_kernels.py
index bb076b2f..d9270049 100644
--- a/qstack/regression/local_kernels.py
+++ b/qstack/regression/local_kernels.py
@@ -13,6 +13,69 @@
 from qstack.regression import __path__ as REGMODULE_PATH
 
 
+RAM_BATCHING_SIZE = 1024**3 * 5  # 5GiB
+def compute_distance_matrix(R1,R2):
+    f"""Computes the manhattan-distance matrix (||r_1 - r_2||_1) between the samples of R1 and R2,
+    using a batched python/numpy implementation.
+
+    Args:
+        R1 (numpy ndarray): First set of samples (can be multi-dimensional).
+        R2 (numpy ndarray): Second set of samples.
+
+    Returns:
+        numpy ndarray: squared-distance matrix of shape (len(R1), len(R2)).
+
+    Raises:
+        RuntimeError: If X and Y have incompatible shapes.
+    """
+    if R1.ndim != R2.ndim or R1.shape[1:] != R2.shape[1:]:
+        raise RuntimeError(f'incompatible shapes for R1 ({R1.shape:r}) and R2 ({R2.shape:r})')
+    # this function is a memory-efficient implementation of the following code
+    # R1v = R1.reshape(R1.shape[0], -1)
+    # R2v = R2.reshape(R2.shape[0], -1)
+    # return np.sum( (R1v[:,None]-R2v[None,:])**2 , axis=2)
+
+    # determine batch size (batch should divide the larger dimention)
+    if R1.shape[0] < R2.shape[0]:
+        transpose_flag = True
+        R2,R1 = R1,R2
+    else:
+        transpose_flag = False
+    dtype=np.result_type(R1,R2)
+    out = np.zeros((R1.shape[0], R2.shape[0]), dtype=dtype)
+
+    # possible weirdness: how is the layout of dtype done if dtype.alignment != dtype.itemsize?
+    batch_size = int(np.floor(RAM_BATCHING_SIZE/ (dtype.itemsize * np.prod(R2.shape))))
+
+    if batch_size == 0:
+        batch_size = 1
+
+    if min(R1.shape[0],R2.shape[0]) == 0 or batch_size >= R1.shape[0]:
+        dists = R1[:,None]-R2[None,:]
+        #np.pow(dists, 2, out=dists)
+        np.abs(dists, out=dists)
+        out = np.sum(dists, axis=tuple(range(2,dists.ndim)))
+    else:
+        n_batches = int(np.ceil(R1.shape[0] / batch_size))
+        dists = np.zeros( (batch_size, *R2.shape), dtype=dtype )
+        for batch_i in range(n_batches):
+            batch_start = batch_i*batch_size
+            this_batch_size = min(batch_size, R1.shape[0]-batch_start)
+
+            R1_view = R1[batch_start : batch_start + this_batch_size, None, ...]
+            np.subtract(R1_view, R2[None,:], out=dists[:this_batch_size])
+            assert not (dists[:this_batch_size]==0).all()
+            #np.pow(dists[:this_batch_size], 2, out=dists[:this_batch_size])  # For Euclidean distance
+            np.abs(dists[:this_batch_size], out=dists[:this_batch_size])
+            assert not (dists[:this_batch_size]==0).all()
+            np.sum(dists[:this_batch_size], out=out[batch_start : batch_start+this_batch_size], axis=tuple(range(2,dists.ndim)))
+            if batch_i == n_batches-1:
+                assert batch_start + this_batch_size == R1.shape[0]
+
+    if transpose_flag:
+        out = out.T
+    return out
+
 def custom_laplacian_kernel(X, Y, gamma):
     """Compute Laplacian kernel between X and Y using Python implementation.
 
@@ -39,7 +102,9 @@ def cdist(X, Y):
             d = np.sum(d, axis=tuple(range(1, len(d.shape))))
             K[i,:] = d
         return K
-    K = -gamma * cdist(X, Y)
+    # if not np.allclose(compute_distance_matrix(X,Y), cdist(X, Y)):
+    #     import pdb; pdb.set_trace()
+    K = -gamma * compute_distance_matrix(X,Y)
     np.exp(K, out=K)
     return K
 
diff --git a/tests/test_kernels.py b/tests/test_kernels.py
index 69d546f9..845642ae 100755
--- a/tests/test_kernels.py
+++ b/tests/test_kernels.py
@@ -1,7 +1,8 @@
 #!/usr/bin/env python3
 
+from time import perf_counter
 import numpy as np
-from qstack.regression import kernel
+from qstack.regression import kernel, local_kernels
 
 
 def test_local_kernels():
@@ -26,6 +27,13 @@ def test_local_kernels():
     K_dot_good = np.array([[1.1760054, 1.48883663], [0.22067605, 0.35151164]])
     K_cos_good = np.array([[0.85579088, 0.91287375], [0.22883447, 0.30712225]])
 
+    X_huge = np.concatenate([X]*1_000, axis=1)
+    X_huge = np.concatenate([X_huge]*1000, axis=0)
+    Y_huge = np.concatenate([Y]*1_000, axis=1)
+    Y_huge = np.concatenate([Y_huge]*50, axis=0)
+    K_L_good_huge = np.concatenate([K_L_good]*1000, axis=0)
+    K_L_good_huge = np.concatenate([K_L_good_huge]*50, axis=1)
+
     for akernel in ['G', 'G_sklearn', 'G_custom_c']:
         K = kernel.kernel(X, Y, akernel=akernel, sigma=2.0)
         assert np.allclose(K, K_G_good)
@@ -34,6 +42,11 @@ def test_local_kernels():
         K = kernel.kernel(X, Y, akernel=akernel, sigma=2.0)
         assert np.allclose(K, K_L_good)
 
+    local_kernels.RAM_BATCHING_SIZE = 1024**2 * 50  # 50MiB
+    for akernel in ['L_custom_c', 'L_custom_py', 'L', 'L_sklearn']:
+        K = kernel.kernel(X_huge, Y_huge, akernel=akernel, sigma=2.0*1000)
+        assert np.allclose(K, K_L_good_huge)
+
     K = kernel.kernel(X.reshape((2,2,2)), Y.reshape((2,2,2)), akernel='L_custom_py', sigma=2.0)
     assert np.allclose(K, K_L_good)
 

From ce9f5a612c90adc2f7db64ce32a0ee99c070559e Mon Sep 17 00:00:00 2001
From: Liam Marsh <liam.marsh@epfl.ch>
Date: Fri, 14 Nov 2025 20:00:13 +0100
Subject: [PATCH 2/7] fix: make ruff happy

---
 qstack/regression/local_kernels.py | 8 +++-----
 tests/test_kernels.py              | 1 -
 2 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/qstack/regression/local_kernels.py b/qstack/regression/local_kernels.py
index d9270049..1a0fbbd0 100644
--- a/qstack/regression/local_kernels.py
+++ b/qstack/regression/local_kernels.py
@@ -15,7 +15,9 @@
 
 RAM_BATCHING_SIZE = 1024**3 * 5  # 5GiB
 def compute_distance_matrix(R1,R2):
-    f"""Computes the manhattan-distance matrix (||r_1 - r_2||_1) between the samples of R1 and R2,
+    """Compute the manhattan-distance matrix.
+
+    This computes (||r_1 - r_2||_1) between the samples of R1 and R2,
     using a batched python/numpy implementation.
 
     Args:
@@ -64,13 +66,9 @@ def compute_distance_matrix(R1,R2):
 
             R1_view = R1[batch_start : batch_start + this_batch_size, None, ...]
             np.subtract(R1_view, R2[None,:], out=dists[:this_batch_size])
-            assert not (dists[:this_batch_size]==0).all()
             #np.pow(dists[:this_batch_size], 2, out=dists[:this_batch_size])  # For Euclidean distance
             np.abs(dists[:this_batch_size], out=dists[:this_batch_size])
-            assert not (dists[:this_batch_size]==0).all()
             np.sum(dists[:this_batch_size], out=out[batch_start : batch_start+this_batch_size], axis=tuple(range(2,dists.ndim)))
-            if batch_i == n_batches-1:
-                assert batch_start + this_batch_size == R1.shape[0]
 
     if transpose_flag:
         out = out.T
diff --git a/tests/test_kernels.py b/tests/test_kernels.py
index 845642ae..82133fc8 100755
--- a/tests/test_kernels.py
+++ b/tests/test_kernels.py
@@ -1,6 +1,5 @@
 #!/usr/bin/env python3
 
-from time import perf_counter
 import numpy as np
 from qstack.regression import kernel, local_kernels
 

From 5675d381bf7acdee26909f05fecf28f875951f99 Mon Sep 17 00:00:00 2001
From: Liam Marsh <liam.marsh@epfl.ch>
Date: Fri, 14 Nov 2025 20:12:37 +0100
Subject: [PATCH 3/7] fix: make ruff happy about setup.py (also sometimes it's
 wrong)

---
 setup.py | 25 +++++++++++++++++--------
 1 file changed, 17 insertions(+), 8 deletions(-)

diff --git a/setup.py b/setup.py
index 5f97a4bc..0d090ec4 100644
--- a/setup.py
+++ b/setup.py
@@ -1,15 +1,22 @@
-from setuptools import setup, find_packages, Extension
+from setuptools import setup, Extension
 import subprocess
-import os, tempfile, shutil
+import os
+import tempfile
+import shutil
+
+# ruff: noqa: S607  # look, if people are installing qstack while using a borked PATH, that's on them
+# ruff: noqa: D100  # this is a setup.py file, no docstring is needed
 
 VERSION="0.0.1"
 
 def get_git_version_hash():
     """Get tag/hash of the latest commit.
-    Thanks to https://gist.github.com/nloadholtes/07a1716a89b53119021592a1d2b56db8"""
+
+    Thanks to https://gist.github.com/nloadholtes/07a1716a89b53119021592a1d2b56db8
+    """
     try:
         p = subprocess.Popen(["git", "describe", "--tags", "--dirty", "--always"], stdout=subprocess.PIPE)
-    except EnvironmentError:
+    except OSError:
         return VERSION + "+unknown"
     version = p.communicate()[0]
     print(version)
@@ -17,8 +24,10 @@ def get_git_version_hash():
 
 
 def check_for_openmp():
-    """Check if there is OpenMP available
-    Thanks to https://stackoverflow.com/questions/16549893/programatically-testing-for-openmp-support-from-a-python-setup-script"""
+    """Check if there is OpenMP available.
+
+    Thanks to https://stackoverflow.com/questions/16549893/programatically-testing-for-openmp-support-from-a-python-setup-script
+    """
     omp_test = 'void main() { }'
     tmpdir = tempfile.mkdtemp()
     curdir = os.getcwd()
@@ -27,7 +36,7 @@ def check_for_openmp():
     with open(filename, 'w') as file:
         file.write(omp_test)
     with open(os.devnull, 'w') as fnull:
-        result = subprocess.call(['cc', '-fopenmp', '-lgomp', filename], stdout=fnull, stderr=fnull)
+        result = subprocess.call(['cc', '-fopenmp', '-lgomp', filename], stdout=fnull, stderr=fnull)  # noqa: S603  (filename is hard-coded earlier in this function)
     os.chdir(curdir)
     shutil.rmtree(tmpdir)
     return not result
@@ -41,6 +50,6 @@ def check_for_openmp():
         ext_modules=[Extension('qstack.regression.lib.manh',
                                ['qstack/regression/lib/manh.c'],
                                extra_compile_args=['-fopenmp', '-std=gnu11'] if openmp_enabled else ['-std=gnu11'],
-                               extra_link_args=['-lgomp'] if openmp_enabled else [])
+                               extra_link_args=['-lgomp'] if openmp_enabled else []),
                     ],
     )

From fdf2aca34431e88b0913f7e8e957765f5e4b6855 Mon Sep 17 00:00:00 2001
From: Liam Marsh <liam.marsh@epfl.ch>
Date: Fri, 14 Nov 2025 21:11:01 +0100
Subject: [PATCH 4/7] fix: Ksenia's review comments

---
 qstack/regression/local_kernels.py | 21 ++++++---------------
 tests/test_kernels.py              | 30 ++++++++++++++++++------------
 2 files changed, 24 insertions(+), 27 deletions(-)

diff --git a/qstack/regression/local_kernels.py b/qstack/regression/local_kernels.py
index 1a0fbbd0..2a081c76 100644
--- a/qstack/regression/local_kernels.py
+++ b/qstack/regression/local_kernels.py
@@ -18,7 +18,11 @@ def compute_distance_matrix(R1,R2):
     """Compute the manhattan-distance matrix.
 
     This computes (||r_1 - r_2||_1) between the samples of R1 and R2,
-    using a batched python/numpy implementation.
+    using a batched python/numpy implementation,
+    designed to be more memory-efficient than a single numpy call and faster than a simple python for loop.
+
+    This function is a batched-over-R1 implementation of the following code:
+    `return np.sum( (R1[:,None, ...]-R2[None,:, ...])**2, axis=tuple(range(2, R1.ndim)))`
 
     Args:
         R1 (numpy ndarray): First set of samples (can be multi-dimensional).
@@ -32,10 +36,7 @@ def compute_distance_matrix(R1,R2):
     """
     if R1.ndim != R2.ndim or R1.shape[1:] != R2.shape[1:]:
         raise RuntimeError(f'incompatible shapes for R1 ({R1.shape:r}) and R2 ({R2.shape:r})')
-    # this function is a memory-efficient implementation of the following code
-    # R1v = R1.reshape(R1.shape[0], -1)
-    # R2v = R2.reshape(R2.shape[0], -1)
-    # return np.sum( (R1v[:,None]-R2v[None,:])**2 , axis=2)
+
 
     # determine batch size (batch should divide the larger dimention)
     if R1.shape[0] < R2.shape[0]:
@@ -92,16 +93,6 @@ def custom_laplacian_kernel(X, Y, gamma):
     """
     if X.shape[1:] != Y.shape[1:]:
         raise RuntimeError(f"Incompatible shapes {X.shape} and {Y.shape}")
-    def cdist(X, Y):
-        K = np.zeros((len(X),len(Y)))
-        for i,x in enumerate(X):
-            x = np.array([x] * len(Y))
-            d = np.abs(x-Y)
-            d = np.sum(d, axis=tuple(range(1, len(d.shape))))
-            K[i,:] = d
-        return K
-    # if not np.allclose(compute_distance_matrix(X,Y), cdist(X, Y)):
-    #     import pdb; pdb.set_trace()
     K = -gamma * compute_distance_matrix(X,Y)
     np.exp(K, out=K)
     return K
diff --git a/tests/test_kernels.py b/tests/test_kernels.py
index 82133fc8..7e6d62b2 100755
--- a/tests/test_kernels.py
+++ b/tests/test_kernels.py
@@ -26,13 +26,6 @@ def test_local_kernels():
     K_dot_good = np.array([[1.1760054, 1.48883663], [0.22067605, 0.35151164]])
     K_cos_good = np.array([[0.85579088, 0.91287375], [0.22883447, 0.30712225]])
 
-    X_huge = np.concatenate([X]*1_000, axis=1)
-    X_huge = np.concatenate([X_huge]*1000, axis=0)
-    Y_huge = np.concatenate([Y]*1_000, axis=1)
-    Y_huge = np.concatenate([Y_huge]*50, axis=0)
-    K_L_good_huge = np.concatenate([K_L_good]*1000, axis=0)
-    K_L_good_huge = np.concatenate([K_L_good_huge]*50, axis=1)
-
     for akernel in ['G', 'G_sklearn', 'G_custom_c']:
         K = kernel.kernel(X, Y, akernel=akernel, sigma=2.0)
         assert np.allclose(K, K_G_good)
@@ -41,11 +34,6 @@ def test_local_kernels():
         K = kernel.kernel(X, Y, akernel=akernel, sigma=2.0)
         assert np.allclose(K, K_L_good)
 
-    local_kernels.RAM_BATCHING_SIZE = 1024**2 * 50  # 50MiB
-    for akernel in ['L_custom_c', 'L_custom_py', 'L', 'L_sklearn']:
-        K = kernel.kernel(X_huge, Y_huge, akernel=akernel, sigma=2.0*1000)
-        assert np.allclose(K, K_L_good_huge)
-
     K = kernel.kernel(X.reshape((2,2,2)), Y.reshape((2,2,2)), akernel='L_custom_py', sigma=2.0)
     assert np.allclose(K, K_L_good)
 
@@ -56,5 +44,23 @@ def test_local_kernels():
     assert np.allclose(K, K_cos_good)
 
 
+def test_batched_local_kernels():
+    X = np.array([[0.70043712, 0.84418664, 0.67651434, 0.72785806], [0.95145796, 0.0127032 , 0.4135877 , 0.04881279]])
+    Y = np.array([[0.09992856, 0.50806631, 0.20024754, 0.74415417], [0.192892  , 0.70084475, 0.29322811, 0.77447945]])
+    K_L_good = np.array([[0.48938983, 0.58251676], [0.32374891, 0.31778924]])
+
+    X_huge = np.concatenate([X]*1_000, axis=1)
+    X_huge = np.concatenate([X_huge]*1000, axis=0)
+    Y_huge = np.concatenate([Y]*1_000, axis=1)
+    Y_huge = np.concatenate([Y_huge]*50, axis=0)
+    K_L_good_huge = np.concatenate([K_L_good]*1000, axis=0)
+    K_L_good_huge = np.concatenate([K_L_good_huge]*50, axis=1)
+
+    local_kernels.RAM_BATCHING_SIZE = 1024**2 * 50  # 50MiB
+    for akernel in ['L_custom_c', 'L_custom_py', 'L', 'L_sklearn']:
+        K = kernel.kernel(X_huge, Y_huge, akernel=akernel, sigma=2.0*1000)
+        assert np.allclose(K, K_L_good_huge)
+
 if __name__ == '__main__':
     test_local_kernels()
+    test_batched_local_kernels()

From 73b39afd4c9564a4d71c936dffa0378b500a8808 Mon Sep 17 00:00:00 2001
From: Ksenia <briling@mail.ru>
Date: Fri, 14 Nov 2025 23:54:30 +0100
Subject: [PATCH 5/7] Formatting

---
 qstack/regression/local_kernels.py | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/qstack/regression/local_kernels.py b/qstack/regression/local_kernels.py
index 2a081c76..ac386609 100644
--- a/qstack/regression/local_kernels.py
+++ b/qstack/regression/local_kernels.py
@@ -2,6 +2,9 @@
 
 Provides:
     local_kernels_dict: Dictionary mapping kernel names to their implementations.
+    RAM_BATCHING_SIZE: Max. RAM (in bytes) that can be used for batched compuation
+        of Manhattan distance matrix for L_custom_py kernel. Can be modified before call
+        using `local_kernels.RAM_BATCHING_SIZE = ...`.
 """
 
 import os
@@ -14,8 +17,10 @@
 
 
 RAM_BATCHING_SIZE = 1024**3 * 5  # 5GiB
-def compute_distance_matrix(R1,R2):
-    """Compute the manhattan-distance matrix.
+
+
+def compute_distance_matrix(R1, R2):
+    """Compute the Manhattan-distance matrix.
 
     This computes (||r_1 - r_2||_1) between the samples of R1 and R2,
     using a batched python/numpy implementation,
@@ -37,7 +42,6 @@ def compute_distance_matrix(R1,R2):
     if R1.ndim != R2.ndim or R1.shape[1:] != R2.shape[1:]:
         raise RuntimeError(f'incompatible shapes for R1 ({R1.shape:r}) and R2 ({R2.shape:r})')
 
-
     # determine batch size (batch should divide the larger dimention)
     if R1.shape[0] < R2.shape[0]:
         transpose_flag = True
@@ -55,7 +59,7 @@ def compute_distance_matrix(R1,R2):
 
     if min(R1.shape[0],R2.shape[0]) == 0 or batch_size >= R1.shape[0]:
         dists = R1[:,None]-R2[None,:]
-        #np.pow(dists, 2, out=dists)
+        #np.pow(dists, 2, out=dists)  # For Euclidean distance
         np.abs(dists, out=dists)
         out = np.sum(dists, axis=tuple(range(2,dists.ndim)))
     else:
@@ -75,6 +79,7 @@ def compute_distance_matrix(R1,R2):
         out = out.T
     return out
 
+
 def custom_laplacian_kernel(X, Y, gamma):
     """Compute Laplacian kernel between X and Y using Python implementation.
 

From f3fec8509c0616626078c6db238ed668907706a7 Mon Sep 17 00:00:00 2001
From: Ksenia <briling@mail.ru>
Date: Sat, 15 Nov 2025 00:51:09 +0100
Subject: [PATCH 6/7] Refactor

---
 qstack/regression/local_kernels.py | 23 +++++++++++------------
 1 file changed, 11 insertions(+), 12 deletions(-)

diff --git a/qstack/regression/local_kernels.py b/qstack/regression/local_kernels.py
index ac386609..f1d746d8 100644
--- a/qstack/regression/local_kernels.py
+++ b/qstack/regression/local_kernels.py
@@ -11,6 +11,7 @@
 import ctypes
 import sysconfig
 import warnings
+import itertools
 import numpy as np
 import sklearn.metrics.pairwise as _SKLEARN_PAIRWISE
 from qstack.regression import __path__ as REGMODULE_PATH
@@ -61,19 +62,17 @@ def compute_distance_matrix(R1, R2):
         dists = R1[:,None]-R2[None,:]
         #np.pow(dists, 2, out=dists)  # For Euclidean distance
         np.abs(dists, out=dists)
-        out = np.sum(dists, axis=tuple(range(2,dists.ndim)))
+        np.sum(dists, out=out, axis=tuple(range(2,dists.ndim)))
     else:
-        n_batches = int(np.ceil(R1.shape[0] / batch_size))
-        dists = np.zeros( (batch_size, *R2.shape), dtype=dtype )
-        for batch_i in range(n_batches):
-            batch_start = batch_i*batch_size
-            this_batch_size = min(batch_size, R1.shape[0]-batch_start)
-
-            R1_view = R1[batch_start : batch_start + this_batch_size, None, ...]
-            np.subtract(R1_view, R2[None,:], out=dists[:this_batch_size])
-            #np.pow(dists[:this_batch_size], 2, out=dists[:this_batch_size])  # For Euclidean distance
-            np.abs(dists[:this_batch_size], out=dists[:this_batch_size])
-            np.sum(dists[:this_batch_size], out=out[batch_start : batch_start+this_batch_size], axis=tuple(range(2,dists.ndim)))
+        dists = np.zeros((batch_size, *R2.shape), dtype=dtype)
+        batch_limits = np.minimum(np.arange(0, R1.shape[0]+batch_size, step=batch_size), R1.shape[0])
+        for batch_start, batch_end in itertools.pairwise(batch_limits):
+            dists_view = dists[:batch_end-batch_start]
+            R1_view = R1[batch_start:batch_end, None, ...]
+            np.subtract(R1_view, R2[None,:], out=dists_view)
+            #np.pow(dists_view, 2, out=dists_view)  # For Euclidean distance
+            np.abs(dists_view, out=dists_view)
+            np.sum(dists_view, out=out[batch_start:batch_end], axis=tuple(range(2,dists.ndim)))
 
     if transpose_flag:
         out = out.T

From aea97e16fb3ed09b5ef58dd40cc7f62b3cdb6795 Mon Sep 17 00:00:00 2001
From: Ksenia <briling@mail.ru>
Date: Sat, 15 Nov 2025 01:15:03 +0100
Subject: [PATCH 7/7] Refactor test

---
 tests/test_kernels.py | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/tests/test_kernels.py b/tests/test_kernels.py
index 7e6d62b2..397fe974 100755
--- a/tests/test_kernels.py
+++ b/tests/test_kernels.py
@@ -49,17 +49,18 @@ def test_batched_local_kernels():
     Y = np.array([[0.09992856, 0.50806631, 0.20024754, 0.74415417], [0.192892  , 0.70084475, 0.29322811, 0.77447945]])
     K_L_good = np.array([[0.48938983, 0.58251676], [0.32374891, 0.31778924]])
 
-    X_huge = np.concatenate([X]*1_000, axis=1)
-    X_huge = np.concatenate([X_huge]*1000, axis=0)
-    Y_huge = np.concatenate([Y]*1_000, axis=1)
-    Y_huge = np.concatenate([Y_huge]*50, axis=0)
-    K_L_good_huge = np.concatenate([K_L_good]*1000, axis=0)
-    K_L_good_huge = np.concatenate([K_L_good_huge]*50, axis=1)
+    X_huge = np.tile(X, (1000,1000))
+    Y_huge = np.tile(Y, (50,1000))
+    K_L_good_huge = np.tile(K_L_good, (1000,50))
 
     local_kernels.RAM_BATCHING_SIZE = 1024**2 * 50  # 50MiB
-    for akernel in ['L_custom_c', 'L_custom_py', 'L', 'L_sklearn']:
-        K = kernel.kernel(X_huge, Y_huge, akernel=akernel, sigma=2.0*1000)
-        assert np.allclose(K, K_L_good_huge)
+
+    K = kernel.kernel(X_huge, Y_huge, akernel='L_custom_py', sigma=2.0*1000)
+    assert np.allclose(K, K_L_good_huge)
+
+    K = kernel.kernel(X_huge.reshape((-1, 50, 80)), Y_huge.reshape((-1, 50, 80)), akernel='L_custom_py', sigma=2.0*1000)
+    assert np.allclose(K, K_L_good_huge)
+
 
 if __name__ == '__main__':
     test_local_kernels()