diff --git a/qstack/regression/local_kernels.py b/qstack/regression/local_kernels.py
index bb076b2..f1d746d 100644
--- a/qstack/regression/local_kernels.py
+++ b/qstack/regression/local_kernels.py
@@ -2,17 +2,83 @@
 
 Provides:
     local_kernels_dict: Dictionary mapping kernel names to their implementations.
+    RAM_BATCHING_SIZE: Max. RAM (in bytes) that can be used for batched computation
+        of Manhattan distance matrix for L_custom_py kernel. Can be modified before call
+        using `local_kernels.RAM_BATCHING_SIZE = ...`.
 """
 
 import os
 import ctypes
 import sysconfig
 import warnings
+import itertools
 import numpy as np
 import sklearn.metrics.pairwise as _SKLEARN_PAIRWISE
 from qstack.regression import __path__ as REGMODULE_PATH
 
 
+RAM_BATCHING_SIZE = 1024**3 * 5 # 5GiB
+
+
+def compute_distance_matrix(R1, R2):
+    """Compute the Manhattan-distance matrix.
+
+    This computes (||r_1 - r_2||_1) between the samples of R1 and R2,
+    using a batched python/numpy implementation,
+    designed to be more memory-efficient than a single numpy call and faster than a simple python for loop.
+
+    This function is a batched-over-R1 implementation of the following code:
+    `return np.sum(np.abs(R1[:,None, ...]-R2[None,:, ...]), axis=tuple(range(2, R1.ndim)))`
+
+    Args:
+        R1 (numpy ndarray): First set of samples (can be multi-dimensional).
+        R2 (numpy ndarray): Second set of samples.
+
+    Returns:
+        numpy ndarray: Manhattan-distance matrix of shape (len(R1), len(R2)).
+
+    Raises:
+        RuntimeError: If R1 and R2 have incompatible shapes.
+    """
+    if R1.ndim != R2.ndim or R1.shape[1:] != R2.shape[1:]:
+        raise RuntimeError(f'incompatible shapes for R1 ({R1.shape!r}) and R2 ({R2.shape!r})')
+
+    # determine batch size (batch should divide the larger dimension)
+    if R1.shape[0] < R2.shape[0]:
+        transpose_flag = True
+        R2,R1 = R1,R2
+    else:
+        transpose_flag = False
+    dtype=np.result_type(R1,R2)
+    out = np.zeros((R1.shape[0], R2.shape[0]), dtype=dtype)
+
+    # possible weirdness: how is the layout of dtype done if dtype.alignment != dtype.itemsize?
+    batch_size = int(np.floor(RAM_BATCHING_SIZE/ (dtype.itemsize * np.prod(R2.shape))))
+
+    if batch_size == 0:
+        batch_size = 1
+
+    if min(R1.shape[0],R2.shape[0]) == 0 or batch_size >= R1.shape[0]:
+        dists = R1[:,None]-R2[None,:]
+        #np.power(dists, 2, out=dists) # For (squared) Euclidean distance
+        np.abs(dists, out=dists)
+        np.sum(dists, out=out, axis=tuple(range(2,dists.ndim)))
+    else:
+        dists = np.zeros((batch_size, *R2.shape), dtype=dtype)
+        batch_limits = np.minimum(np.arange(0, R1.shape[0]+batch_size, step=batch_size), R1.shape[0])
+        for batch_start, batch_end in itertools.pairwise(batch_limits):
+            dists_view = dists[:batch_end-batch_start]
+            R1_view = R1[batch_start:batch_end, None, ...]
+            np.subtract(R1_view, R2[None,:], out=dists_view)
+            #np.power(dists_view, 2, out=dists_view) # For (squared) Euclidean distance
+            np.abs(dists_view, out=dists_view)
+            np.sum(dists_view, out=out[batch_start:batch_end], axis=tuple(range(2,dists.ndim)))
+
+    if transpose_flag:
+        out = out.T
+    return out
+
+
 def custom_laplacian_kernel(X, Y, gamma):
     """Compute Laplacian kernel between X and Y using Python implementation.
 
@@ -31,15 +97,7 @@ def custom_laplacian_kernel(X, Y, gamma):
     """
     if X.shape[1:] != Y.shape[1:]:
         raise RuntimeError(f"Incompatible shapes {X.shape} and {Y.shape}")
-    def cdist(X, Y):
-        K = np.zeros((len(X),len(Y)))
-        for i,x in enumerate(X):
-            x = np.array([x] * len(Y))
-            d = np.abs(x-Y)
-            d = np.sum(d, axis=tuple(range(1, len(d.shape))))
-            K[i,:] = d
-        return K
-    K = -gamma * cdist(X, Y)
+    K = -gamma * compute_distance_matrix(X,Y)
     np.exp(K, out=K)
     return K
 
diff --git a/setup.py b/setup.py
index 5f97a4b..0d090ec 100644
--- a/setup.py
+++ b/setup.py
@@ -1,15 +1,22 @@
-from setuptools import setup, find_packages, Extension
+from setuptools import setup, Extension
 import subprocess
-import os, tempfile, shutil
+import os
+import tempfile
+import shutil
+
+# ruff: noqa: S607 # look, if people are installing qstack while using a borked PATH, that's on them
+# ruff: noqa: D100 # this is a setup.py file, no docstring is needed
 
 VERSION="0.0.1"
 
 def get_git_version_hash():
     """Get tag/hash of the latest commit.
-    Thanks to https://gist.github.com/nloadholtes/07a1716a89b53119021592a1d2b56db8"""
+
+    Thanks to https://gist.github.com/nloadholtes/07a1716a89b53119021592a1d2b56db8
+    """
     try:
         p = subprocess.Popen(["git", "describe", "--tags", "--dirty", "--always"], stdout=subprocess.PIPE)
-    except EnvironmentError:
+    except OSError:
        return VERSION + "+unknown"
     version = p.communicate()[0]
     print(version)
@@ -17,8 +24,10 @@ def get_git_version_hash():
 
 
 def check_for_openmp():
-    """Check if there is OpenMP available
-    Thanks to https://stackoverflow.com/questions/16549893/programatically-testing-for-openmp-support-from-a-python-setup-script"""
+    """Check if there is OpenMP available.
+
+    Thanks to https://stackoverflow.com/questions/16549893/programatically-testing-for-openmp-support-from-a-python-setup-script
+    """
     omp_test = 'void main() { }'
     tmpdir = tempfile.mkdtemp()
     curdir = os.getcwd()
@@ -27,7 +36,7 @@ def check_for_openmp():
     with open(filename, 'w') as file:
         file.write(omp_test)
     with open(os.devnull, 'w') as fnull:
-        result = subprocess.call(['cc', '-fopenmp', '-lgomp', filename], stdout=fnull, stderr=fnull)
+        result = subprocess.call(['cc', '-fopenmp', '-lgomp', filename], stdout=fnull, stderr=fnull)  # noqa: S603 (filename is hard-coded earlier in this function)
     os.chdir(curdir)
     shutil.rmtree(tmpdir)
     return not result
@@ -41,6 +50,6 @@
     ext_modules=[Extension('qstack.regression.lib.manh',
                            ['qstack/regression/lib/manh.c'],
                            extra_compile_args=['-fopenmp', '-std=gnu11'] if openmp_enabled else ['-std=gnu11'],
-                           extra_link_args=['-lgomp'] if openmp_enabled else [])
+                           extra_link_args=['-lgomp'] if openmp_enabled else []),
                ],
     )
diff --git a/tests/test_kernels.py b/tests/test_kernels.py
index 69d546f..397fe97 100755
--- a/tests/test_kernels.py
+++ b/tests/test_kernels.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 
 import numpy as np
-from qstack.regression import kernel
+from qstack.regression import kernel, local_kernels
 
 
 def test_local_kernels():
@@ -44,5 +44,29 @@ def test_local_kernels():
     assert np.allclose(K, K_cos_good)
 
 
+def test_batched_local_kernels():
+    X = np.array([[0.70043712, 0.84418664, 0.67651434, 0.72785806], [0.95145796, 0.0127032 , 0.4135877 , 0.04881279]])
+    Y = np.array([[0.09992856, 0.50806631, 0.20024754, 0.74415417], [0.192892 , 0.70084475, 0.29322811, 0.77447945]])
+    K_L_good = np.array([[0.48938983, 0.58251676], [0.32374891, 0.31778924]])
+
+    X_huge = np.tile(X, (1000,1000))
+    Y_huge = np.tile(Y, (50,1000))
+    K_L_good_huge = np.tile(K_L_good, (1000,50))
+
+    # shrink the RAM budget so the batched code path is actually exercised,
+    # and restore it afterwards so later tests see the module default again
+    saved_batching_size = local_kernels.RAM_BATCHING_SIZE
+    local_kernels.RAM_BATCHING_SIZE = 1024**2 * 50 # 50MiB
+    try:
+        K = kernel.kernel(X_huge, Y_huge, akernel='L_custom_py', sigma=2.0*1000)
+        assert np.allclose(K, K_L_good_huge)
+
+        K = kernel.kernel(X_huge.reshape((-1, 50, 80)), Y_huge.reshape((-1, 50, 80)), akernel='L_custom_py', sigma=2.0*1000)
+        assert np.allclose(K, K_L_good_huge)
+    finally:
+        local_kernels.RAM_BATCHING_SIZE = saved_batching_size
+
+
 if __name__ == '__main__':
     test_local_kernels()
+    test_batched_local_kernels()