From a780e01bd897a1cc0a1ef5bccafec85a59187056 Mon Sep 17 00:00:00 2001 From: Liam Marsh Date: Fri, 14 Nov 2025 15:22:00 +0100 Subject: [PATCH 1/7] feat: speed up python implementation of laplacian kernel --- qstack/regression/local_kernels.py | 67 +++++++++++++++++++++++++++++- tests/test_kernels.py | 15 ++++++- 2 files changed, 80 insertions(+), 2 deletions(-) diff --git a/qstack/regression/local_kernels.py b/qstack/regression/local_kernels.py index bb076b2f..d9270049 100644 --- a/qstack/regression/local_kernels.py +++ b/qstack/regression/local_kernels.py @@ -13,6 +13,69 @@ from qstack.regression import __path__ as REGMODULE_PATH +RAM_BATCHING_SIZE = 1024**3 * 5 # 5GiB +def compute_distance_matrix(R1,R2): + f"""Computes the manhattan-distance matrix (||r_1 - r_2||_1) between the samples of R1 and R2, + using a batched python/numpy implementation. + + Args: + R1 (numpy ndarray): First set of samples (can be multi-dimensional). + R2 (numpy ndarray): Second set of samples. + + Returns: + numpy ndarray: squared-distance matrix of shape (len(R1), len(R2)). + + Raises: + RuntimeError: If X and Y have incompatible shapes. + """ + if R1.ndim != R2.ndim or R1.shape[1:] != R2.shape[1:]: + raise RuntimeError(f'incompatible shapes for R1 ({R1.shape:r}) and R2 ({R2.shape:r})') + # this function is a memory-efficient implementation of the following code + # R1v = R1.reshape(R1.shape[0], -1) + # R2v = R2.reshape(R2.shape[0], -1) + # return np.sum( (R1v[:,None]-R2v[None,:])**2 , axis=2) + + # determine batch size (batch should divide the larger dimention) + if R1.shape[0] < R2.shape[0]: + transpose_flag = True + R2,R1 = R1,R2 + else: + transpose_flag = False + dtype=np.result_type(R1,R2) + out = np.zeros((R1.shape[0], R2.shape[0]), dtype=dtype) + + # possible weirdness: how is the layout of dtype done if dtype.alignment != dtype.itemsize? + batch_size = int(np.floor(RAM_BATCHING_SIZE/ (dtype.itemsize * np.prod(R2.shape)))) + + if batch_size == 0: + batch_size = 1 + + if min(R1.shape[0],R2.shape[0]) == 0 or batch_size >= R1.shape[0]: + dists = R1[:,None]-R2[None,:] + #np.pow(dists, 2, out=dists) + np.abs(dists, out=dists) + out = np.sum(dists, axis=tuple(range(2,dists.ndim))) + else: + n_batches = int(np.ceil(R1.shape[0] / batch_size)) + dists = np.zeros( (batch_size, *R2.shape), dtype=dtype ) + for batch_i in range(n_batches): + batch_start = batch_i*batch_size + this_batch_size = min(batch_size, R1.shape[0]-batch_start) + + R1_view = R1[batch_start : batch_start + this_batch_size, None, ...] + np.subtract(R1_view, R2[None,:], out=dists[:this_batch_size]) + assert not (dists[:this_batch_size]==0).all() + #np.pow(dists[:this_batch_size], 2, out=dists[:this_batch_size]) # For Euclidean distance + np.abs(dists[:this_batch_size], out=dists[:this_batch_size]) + assert not (dists[:this_batch_size]==0).all() + np.sum(dists[:this_batch_size], out=out[batch_start : batch_start+this_batch_size], axis=tuple(range(2,dists.ndim))) + if batch_i == n_batches-1: + assert batch_start + this_batch_size == R1.shape[0] + + if transpose_flag: + out = out.T + return out + def custom_laplacian_kernel(X, Y, gamma): """Compute Laplacian kernel between X and Y using Python implementation. @@ -39,7 +102,9 @@ def cdist(X, Y): d = np.sum(d, axis=tuple(range(1, len(d.shape)))) K[i,:] = d return K - K = -gamma * cdist(X, Y) + # if not np.allclose(compute_distance_matrix(X,Y), cdist(X, Y)): + # import pdb; pdb.set_trace() + K = -gamma * compute_distance_matrix(X,Y) np.exp(K, out=K) return K diff --git a/tests/test_kernels.py b/tests/test_kernels.py index 69d546f9..845642ae 100755 --- a/tests/test_kernels.py +++ b/tests/test_kernels.py @@ -1,7 +1,8 @@ #!/usr/bin/env python3 +from time import perf_counter import numpy as np -from qstack.regression import kernel +from qstack.regression import kernel, local_kernels def test_local_kernels(): @@ -26,6 +27,13 @@ def test_local_kernels(): K_dot_good = np.array([[1.1760054, 1.48883663], [0.22067605, 0.35151164]]) K_cos_good = np.array([[0.85579088, 0.91287375], [0.22883447, 0.30712225]]) + X_huge = np.concatenate([X]*1_000, axis=1) + X_huge = np.concatenate([X_huge]*1000, axis=0) + Y_huge = np.concatenate([Y]*1_000, axis=1) + Y_huge = np.concatenate([Y_huge]*50, axis=0) + K_L_good_huge = np.concatenate([K_L_good]*1000, axis=0) + K_L_good_huge = np.concatenate([K_L_good_huge]*50, axis=1) + for akernel in ['G', 'G_sklearn', 'G_custom_c']: K = kernel.kernel(X, Y, akernel=akernel, sigma=2.0) assert np.allclose(K, K_G_good) @@ -34,6 +42,11 @@ def test_local_kernels(): K = kernel.kernel(X, Y, akernel=akernel, sigma=2.0) assert np.allclose(K, K_L_good) + local_kernels.RAM_BATCHING_SIZE = 1024**2 * 50 # 50MiB + for akernel in ['L_custom_c', 'L_custom_py', 'L', 'L_sklearn']: + K = kernel.kernel(X_huge, Y_huge, akernel=akernel, sigma=2.0*1000) + assert np.allclose(K, K_L_good_huge) + K = kernel.kernel(X.reshape((2,2,2)), Y.reshape((2,2,2)), akernel='L_custom_py', sigma=2.0) assert np.allclose(K, K_L_good) From ce9f5a612c90adc2f7db64ce32a0ee99c070559e Mon Sep 17 00:00:00 2001 From: Liam Marsh Date: Fri, 14 Nov 2025 20:00:13 +0100 Subject: [PATCH 2/7] fix: make ruff happy --- qstack/regression/local_kernels.py | 8 +++----- tests/test_kernels.py | 1 - 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/qstack/regression/local_kernels.py b/qstack/regression/local_kernels.py index d9270049..1a0fbbd0 100644 --- a/qstack/regression/local_kernels.py +++ b/qstack/regression/local_kernels.py @@ -15,7 +15,9 @@ RAM_BATCHING_SIZE = 1024**3 * 5 # 5GiB def compute_distance_matrix(R1,R2): - f"""Computes the manhattan-distance matrix (||r_1 - r_2||_1) between the samples of R1 and R2, + """Compute the manhattan-distance matrix. + + This computes (||r_1 - r_2||_1) between the samples of R1 and R2, using a batched python/numpy implementation. Args: @@ -64,13 +66,9 @@ def compute_distance_matrix(R1,R2): R1_view = R1[batch_start : batch_start + this_batch_size, None, ...] np.subtract(R1_view, R2[None,:], out=dists[:this_batch_size]) - assert not (dists[:this_batch_size]==0).all() #np.pow(dists[:this_batch_size], 2, out=dists[:this_batch_size]) # For Euclidean distance np.abs(dists[:this_batch_size], out=dists[:this_batch_size]) - assert not (dists[:this_batch_size]==0).all() np.sum(dists[:this_batch_size], out=out[batch_start : batch_start+this_batch_size], axis=tuple(range(2,dists.ndim))) - if batch_i == n_batches-1: - assert batch_start + this_batch_size == R1.shape[0] if transpose_flag: out = out.T diff --git a/tests/test_kernels.py b/tests/test_kernels.py index 845642ae..82133fc8 100755 --- a/tests/test_kernels.py +++ b/tests/test_kernels.py @@ -1,6 +1,5 @@ #!/usr/bin/env python3 -from time import perf_counter import numpy as np from qstack.regression import kernel, local_kernels From 5675d381bf7acdee26909f05fecf28f875951f99 Mon Sep 17 00:00:00 2001 From: Liam Marsh Date: Fri, 14 Nov 2025 20:12:37 +0100 Subject: [PATCH 3/7] fix: make ruff happy about setup.py (also sometimes it's wrong) --- setup.py | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/setup.py b/setup.py index 5f97a4bc..0d090ec4 100644 --- a/setup.py +++ b/setup.py @@ -1,15 +1,22 @@ -from setuptools import setup, find_packages, Extension +from setuptools import setup, Extension import subprocess -import os, tempfile, shutil +import os +import tempfile +import shutil + +# ruff: noqa: S607 # look, if people are installing qstack while using a borked PATH, that's on them +# ruff: noqa: D100 # this is a setup.py file, no docstring is needed VERSION="0.0.1" def get_git_version_hash(): """Get tag/hash of the latest commit. - Thanks to https://gist.github.com/nloadholtes/07a1716a89b53119021592a1d2b56db8""" + + Thanks to https://gist.github.com/nloadholtes/07a1716a89b53119021592a1d2b56db8 + """ try: p = subprocess.Popen(["git", "describe", "--tags", "--dirty", "--always"], stdout=subprocess.PIPE) - except EnvironmentError: + except OSError: return VERSION + "+unknown" version = p.communicate()[0] print(version) @@ -17,8 +24,10 @@ def get_git_version_hash(): def check_for_openmp(): - """Check if there is OpenMP available - Thanks to https://stackoverflow.com/questions/16549893/programatically-testing-for-openmp-support-from-a-python-setup-script""" + """Check if there is OpenMP available. + + Thanks to https://stackoverflow.com/questions/16549893/programatically-testing-for-openmp-support-from-a-python-setup-script + """ omp_test = 'void main() { }' tmpdir = tempfile.mkdtemp() curdir = os.getcwd() @@ -27,7 +36,7 @@ def check_for_openmp(): with open(filename, 'w') as file: file.write(omp_test) with open(os.devnull, 'w') as fnull: - result = subprocess.call(['cc', '-fopenmp', '-lgomp', filename], stdout=fnull, stderr=fnull) + result = subprocess.call(['cc', '-fopenmp', '-lgomp', filename], stdout=fnull, stderr=fnull) # noqa: S603 (filename is hard-coded earlier in this function) os.chdir(curdir) shutil.rmtree(tmpdir) return not result @@ -41,6 +50,6 @@ def check_for_openmp(): ext_modules=[Extension('qstack.regression.lib.manh', ['qstack/regression/lib/manh.c'], extra_compile_args=['-fopenmp', '-std=gnu11'] if openmp_enabled else ['-std=gnu11'], - extra_link_args=['-lgomp'] if openmp_enabled else []) + extra_link_args=['-lgomp'] if openmp_enabled else []), ], ) From fdf2aca34431e88b0913f7e8e957765f5e4b6855 Mon Sep 17 00:00:00 2001 From: Liam Marsh Date: Fri, 14 Nov 2025 21:11:01 +0100 Subject: [PATCH 4/7] fix: Ksenia's review comments --- qstack/regression/local_kernels.py | 21 ++++++--------------- tests/test_kernels.py | 30 ++++++++++++++++++------------ 2 files changed, 24 insertions(+), 27 deletions(-) diff --git a/qstack/regression/local_kernels.py b/qstack/regression/local_kernels.py index 1a0fbbd0..2a081c76 100644 --- a/qstack/regression/local_kernels.py +++ b/qstack/regression/local_kernels.py @@ -18,7 +18,11 @@ def compute_distance_matrix(R1,R2): """Compute the manhattan-distance matrix. This computes (||r_1 - r_2||_1) between the samples of R1 and R2, - using a batched python/numpy implementation. + using a batched python/numpy implementation, + designed to be more memory-efficient than a single numpy call and faster than a simple python for loop. + + This function is a batched-over-R1 implementation of the following code: + `return np.sum( (R1[:,None, ...]-R2[None,:, ...])**2, axis=tuple(range(2, R1.ndim)))` Args: R1 (numpy ndarray): First set of samples (can be multi-dimensional). @@ -32,10 +36,7 @@ def compute_distance_matrix(R1,R2): """ if R1.ndim != R2.ndim or R1.shape[1:] != R2.shape[1:]: raise RuntimeError(f'incompatible shapes for R1 ({R1.shape:r}) and R2 ({R2.shape:r})') - # this function is a memory-efficient implementation of the following code - # R1v = R1.reshape(R1.shape[0], -1) - # R2v = R2.reshape(R2.shape[0], -1) - # return np.sum( (R1v[:,None]-R2v[None,:])**2 , axis=2) + # determine batch size (batch should divide the larger dimention) if R1.shape[0] < R2.shape[0]: @@ -92,16 +93,6 @@ def custom_laplacian_kernel(X, Y, gamma): """ if X.shape[1:] != Y.shape[1:]: raise RuntimeError(f"Incompatible shapes {X.shape} and {Y.shape}") - def cdist(X, Y): - K = np.zeros((len(X),len(Y))) - for i,x in enumerate(X): - x = np.array([x] * len(Y)) - d = np.abs(x-Y) - d = np.sum(d, axis=tuple(range(1, len(d.shape)))) - K[i,:] = d - return K - # if not np.allclose(compute_distance_matrix(X,Y), cdist(X, Y)): - # import pdb; pdb.set_trace() K = -gamma * compute_distance_matrix(X,Y) np.exp(K, out=K) return K diff --git a/tests/test_kernels.py b/tests/test_kernels.py index 82133fc8..7e6d62b2 100755 --- a/tests/test_kernels.py +++ b/tests/test_kernels.py @@ -26,13 +26,6 @@ def test_local_kernels(): K_dot_good = np.array([[1.1760054, 1.48883663], [0.22067605, 0.35151164]]) K_cos_good = np.array([[0.85579088, 0.91287375], [0.22883447, 0.30712225]]) - X_huge = np.concatenate([X]*1_000, axis=1) - X_huge = np.concatenate([X_huge]*1000, axis=0) - Y_huge = np.concatenate([Y]*1_000, axis=1) - Y_huge = np.concatenate([Y_huge]*50, axis=0) - K_L_good_huge = np.concatenate([K_L_good]*1000, axis=0) - K_L_good_huge = np.concatenate([K_L_good_huge]*50, axis=1) - for akernel in ['G', 'G_sklearn', 'G_custom_c']: K = kernel.kernel(X, Y, akernel=akernel, sigma=2.0) assert np.allclose(K, K_G_good) @@ -41,11 +34,6 @@ def test_local_kernels(): K = kernel.kernel(X, Y, akernel=akernel, sigma=2.0) assert np.allclose(K, K_L_good) - local_kernels.RAM_BATCHING_SIZE = 1024**2 * 50 # 50MiB - for akernel in ['L_custom_c', 'L_custom_py', 'L', 'L_sklearn']: - K = kernel.kernel(X_huge, Y_huge, akernel=akernel, sigma=2.0*1000) - assert np.allclose(K, K_L_good_huge) - K = kernel.kernel(X.reshape((2,2,2)), Y.reshape((2,2,2)), akernel='L_custom_py', sigma=2.0) assert np.allclose(K, K_L_good) @@ -56,5 +44,23 @@ def test_local_kernels(): assert np.allclose(K, K_cos_good) +def test_batched_local_kernels(): + X = np.array([[0.70043712, 0.84418664, 0.67651434, 0.72785806], [0.95145796, 0.0127032 , 0.4135877 , 0.04881279]]) + Y = np.array([[0.09992856, 0.50806631, 0.20024754, 0.74415417], [0.192892 , 0.70084475, 0.29322811, 0.77447945]]) + K_L_good = np.array([[0.48938983, 0.58251676], [0.32374891, 0.31778924]]) + + X_huge = np.concatenate([X]*1_000, axis=1) + X_huge = np.concatenate([X_huge]*1000, axis=0) + Y_huge = np.concatenate([Y]*1_000, axis=1) + Y_huge = np.concatenate([Y_huge]*50, axis=0) + K_L_good_huge = np.concatenate([K_L_good]*1000, axis=0) + K_L_good_huge = np.concatenate([K_L_good_huge]*50, axis=1) + + local_kernels.RAM_BATCHING_SIZE = 1024**2 * 50 # 50MiB + for akernel in ['L_custom_c', 'L_custom_py', 'L', 'L_sklearn']: + K = kernel.kernel(X_huge, Y_huge, akernel=akernel, sigma=2.0*1000) + assert np.allclose(K, K_L_good_huge) + if __name__ == '__main__': test_local_kernels() + test_batched_local_kernels() From 73b39afd4c9564a4d71c936dffa0378b500a8808 Mon Sep 17 00:00:00 2001 From: Ksenia Date: Fri, 14 Nov 2025 23:54:30 +0100 Subject: [PATCH 5/7] Formatting --- qstack/regression/local_kernels.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/qstack/regression/local_kernels.py b/qstack/regression/local_kernels.py index 2a081c76..ac386609 100644 --- a/qstack/regression/local_kernels.py +++ b/qstack/regression/local_kernels.py @@ -2,6 +2,9 @@ Provides: local_kernels_dict: Dictionary mapping kernel names to their implementations. + RAM_BATCHING_SIZE: Max. RAM (in bytes) that can be used for batched compuation + of Manhattan distance matrix for L_custom_py kernel. Can be modified before call + using `local_kernels.RAM_BATCHING_SIZE = ...`. """ import os @@ -14,8 +17,10 @@ RAM_BATCHING_SIZE = 1024**3 * 5 # 5GiB -def compute_distance_matrix(R1,R2): - """Compute the manhattan-distance matrix. + + +def compute_distance_matrix(R1, R2): + """Compute the Manhattan-distance matrix. This computes (||r_1 - r_2||_1) between the samples of R1 and R2, using a batched python/numpy implementation, @@ -37,7 +42,6 @@ def compute_distance_matrix(R1,R2): if R1.ndim != R2.ndim or R1.shape[1:] != R2.shape[1:]: raise RuntimeError(f'incompatible shapes for R1 ({R1.shape:r}) and R2 ({R2.shape:r})') - # determine batch size (batch should divide the larger dimention) if R1.shape[0] < R2.shape[0]: transpose_flag = True @@ -55,7 +59,7 @@ def compute_distance_matrix(R1,R2): if min(R1.shape[0],R2.shape[0]) == 0 or batch_size >= R1.shape[0]: dists = R1[:,None]-R2[None,:] - #np.pow(dists, 2, out=dists) + #np.pow(dists, 2, out=dists) # For Euclidean distance np.abs(dists, out=dists) out = np.sum(dists, axis=tuple(range(2,dists.ndim))) else: @@ -75,6 +79,7 @@ def compute_distance_matrix(R1,R2): out = out.T return out + def custom_laplacian_kernel(X, Y, gamma): """Compute Laplacian kernel between X and Y using Python implementation. From f3fec8509c0616626078c6db238ed668907706a7 Mon Sep 17 00:00:00 2001 From: Ksenia Date: Sat, 15 Nov 2025 00:51:09 +0100 Subject: [PATCH 6/7] Refactor --- qstack/regression/local_kernels.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/qstack/regression/local_kernels.py b/qstack/regression/local_kernels.py index ac386609..f1d746d8 100644 --- a/qstack/regression/local_kernels.py +++ b/qstack/regression/local_kernels.py @@ -11,6 +11,7 @@ import ctypes import sysconfig import warnings +import itertools import numpy as np import sklearn.metrics.pairwise as _SKLEARN_PAIRWISE from qstack.regression import __path__ as REGMODULE_PATH @@ -61,19 +62,17 @@ def compute_distance_matrix(R1, R2): dists = R1[:,None]-R2[None,:] #np.pow(dists, 2, out=dists) # For Euclidean distance np.abs(dists, out=dists) - out = np.sum(dists, axis=tuple(range(2,dists.ndim))) + np.sum(dists, out=out, axis=tuple(range(2,dists.ndim))) else: - n_batches = int(np.ceil(R1.shape[0] / batch_size)) - dists = np.zeros( (batch_size, *R2.shape), dtype=dtype ) - for batch_i in range(n_batches): - batch_start = batch_i*batch_size - this_batch_size = min(batch_size, R1.shape[0]-batch_start) - - R1_view = R1[batch_start : batch_start + this_batch_size, None, ...] - np.subtract(R1_view, R2[None,:], out=dists[:this_batch_size]) - #np.pow(dists[:this_batch_size], 2, out=dists[:this_batch_size]) # For Euclidean distance - np.abs(dists[:this_batch_size], out=dists[:this_batch_size]) - np.sum(dists[:this_batch_size], out=out[batch_start : batch_start+this_batch_size], axis=tuple(range(2,dists.ndim))) + dists = np.zeros((batch_size, *R2.shape), dtype=dtype) + batch_limits = np.minimum(np.arange(0, R1.shape[0]+batch_size, step=batch_size), R1.shape[0]) + for batch_start, batch_end in itertools.pairwise(batch_limits): + dists_view = dists[:batch_end-batch_start] + R1_view = R1[batch_start:batch_end, None, ...] + np.subtract(R1_view, R2[None,:], out=dists_view) + #np.pow(dists_view, 2, out=dists_view) # For Euclidean distance + np.abs(dists_view, out=dists_view) + np.sum(dists_view, out=out[batch_start:batch_end], axis=tuple(range(2,dists.ndim))) if transpose_flag: out = out.T From aea97e16fb3ed09b5ef58dd40cc7f62b3cdb6795 Mon Sep 17 00:00:00 2001 From: Ksenia Date: Sat, 15 Nov 2025 01:15:03 +0100 Subject: [PATCH 7/7] Refactor test --- tests/test_kernels.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/tests/test_kernels.py b/tests/test_kernels.py index 7e6d62b2..397fe974 100755 --- a/tests/test_kernels.py +++ b/tests/test_kernels.py @@ -49,17 +49,18 @@ def test_batched_local_kernels(): Y = np.array([[0.09992856, 0.50806631, 0.20024754, 0.74415417], [0.192892 , 0.70084475, 0.29322811, 0.77447945]]) K_L_good = np.array([[0.48938983, 0.58251676], [0.32374891, 0.31778924]]) - X_huge = np.concatenate([X]*1_000, axis=1) - X_huge = np.concatenate([X_huge]*1000, axis=0) - Y_huge = np.concatenate([Y]*1_000, axis=1) - Y_huge = np.concatenate([Y_huge]*50, axis=0) - K_L_good_huge = np.concatenate([K_L_good]*1000, axis=0) - K_L_good_huge = np.concatenate([K_L_good_huge]*50, axis=1) + X_huge = np.tile(X, (1000,1000)) + Y_huge = np.tile(Y, (50,1000)) + K_L_good_huge = np.tile(K_L_good, (1000,50)) local_kernels.RAM_BATCHING_SIZE = 1024**2 * 50 # 50MiB - for akernel in ['L_custom_c', 'L_custom_py', 'L', 'L_sklearn']: - K = kernel.kernel(X_huge, Y_huge, akernel=akernel, sigma=2.0*1000) - assert np.allclose(K, K_L_good_huge) + + K = kernel.kernel(X_huge, Y_huge, akernel='L_custom_py', sigma=2.0*1000) + assert np.allclose(K, K_L_good_huge) + + K = kernel.kernel(X_huge.reshape((-1, 50, 80)), Y_huge.reshape((-1, 50, 80)), akernel='L_custom_py', sigma=2.0*1000) + assert np.allclose(K, K_L_good_huge) + if __name__ == '__main__': test_local_kernels()