From 8388c35cb2708173928fe03d63e8e164d54c0e98 Mon Sep 17 00:00:00 2001 From: Aaron Meyer Date: Wed, 28 May 2025 07:08:56 -0700 Subject: [PATCH 1/3] Perform parafac on the GPU --- parafac2/parafac2.py | 55 ++++----------------------------- parafac2/tests/test_parafac2.py | 16 +++------- parafac2/utils.py | 53 +++++++++++++++++++++++++++++-- 3 files changed, 62 insertions(+), 62 deletions(-) diff --git a/parafac2/parafac2.py b/parafac2/parafac2.py index 3089cb4..93a32ca 100644 --- a/parafac2/parafac2.py +++ b/parafac2/parafac2.py @@ -3,16 +3,15 @@ from copy import deepcopy import anndata +import cupy as cp import numpy as np from scipy.sparse.linalg import norm from sklearn.utils.extmath import randomized_svd -from tensorly.cp_tensor import cp_normalize -from tensorly.tenalg.core_tenalg import unfolding_dot_khatri_rao from tqdm import tqdm -from .SECSI import SECSI from .utils import ( anndata_to_list, + parafac, project_data, reconstruction_error, standardize_pf2, @@ -59,54 +58,12 @@ def parafac2_init( return factors, norm_tensor -def parafac( - tensor: np.ndarray, - factors: list[np.ndarray], - n_iter_max: int = 3, -) -> list[np.ndarray]: - """Decomposes a tensor into a set of factor matrices using the PARAFAC2 algorithm. - PARAFAC2 decomposes a tensor into a set of factor matrices, where one factor - matrix can vary across slices in one mode. This function implements the core - PARAFAC2 algorithm using alternating least squares. - Args: - tensor (numpy.ndarray): The tensor to decompose. - factors (list of numpy.ndarray): Initial guess for the factor matrices. - The length of the list must be equal to the number of modes in the tensor. - Each element of the list is a numpy.ndarray of shape (size_mode_i, rank), - where size_mode_i is the size of the tensor along mode i, and rank is - the desired rank of the decomposition. - n_iter_max (int, optional): Maximum number of iterations for the algorithm. - Defaults to 3. - Returns: - list of numpy.ndarray: A list containing the factor matrices. - The order of the factor matrices corresponds to the modes of the input - tensor. The first factor matrix is scaled by the weights. - """ - rank = factors[0].shape[1] - - for _ in range(n_iter_max): - for mode in range(tensor.ndim): - pinv = np.ones((rank, rank)) - for i, factor in enumerate(factors): - if i != mode: - pinv *= factor.T @ factor - - mttkrp = unfolding_dot_khatri_rao(tensor, (None, factors), mode) - factors[mode] = np.linalg.solve(pinv.T, mttkrp.T).T - - weights, factors = cp_normalize((None, factors)) - factors[0] *= weights[None, :] - - return factors - - def parafac2_nd( X_in: anndata.AnnData, rank: int, n_iter_max: int = 100, tol: float = 1e-6, random_state: int | None = None, - SECSI_solver=False, callback: Callable[[int, float, list, list], None] | None = None, ) -> tuple[tuple, float]: r"""The same interface as regular PARAFAC2.""" @@ -133,10 +90,6 @@ def parafac2_nd( err = reconstruction_error(factors, projections, projected_X, norm_tensor) errs = [err] - if SECSI_solver: - SECSerror, factorOuts = SECSI(projected_X, rank, verbose=False) - factors = factorOuts[np.argmin(SECSerror)].factors - print("") tq = tqdm(range(n_iter_max), disable=(not verbose), delay=1.0) for iteration in tq: @@ -171,11 +124,15 @@ def parafac2_nd( factors_old = deepcopy(factors) + factors = [cp.array(f) for f in factors] + factors = parafac( projected_X, factors, ) + factors = [cp.asnumpy(f) for f in factors] + delta = errs[-2] - errs[-1] tq.set_postfix( error=errs[-1], R2X=1.0 - errs[-1], Δ=delta, jump=jump, refresh=False diff --git a/parafac2/tests/test_parafac2.py b/parafac2/tests/test_parafac2.py index 6f09cc7..16677d4 100644 --- a/parafac2/tests/test_parafac2.py +++ b/parafac2/tests/test_parafac2.py @@ -59,9 +59,8 @@ def test_init_reprod(sparse: bool): cp.testing.assert_array_equal(proj1[ii], proj2[ii]) -@pytest.mark.parametrize("SECSI_solver", [False, True]) @pytest.mark.parametrize("sparse", [False, True]) -def test_parafac2(sparse: bool, SECSI_solver: bool): +def test_parafac2(sparse: bool): """Test for equivalence to TensorLy's PARAFAC2.""" pf2shape = [(250, 1000)] * 8 X: list[np.ndarray] = random_parafac2(pf2shape, rank=3, full=True, random_state=2) # type: ignore @@ -71,18 +70,14 @@ def test_parafac2(sparse: bool, SECSI_solver: bool): options = {"tol": 1e-9, "n_iter_max": 200} - (w1, f1, p1), e1 = parafac2_nd( - X_ann, rank=3, random_state=1, SECSI_solver=SECSI_solver, **options - ) + (w1, f1, p1), e1 = parafac2_nd(X_ann, rank=3, random_state=1, **options) # Test that the model still matches the data err = _parafac2_reconstruction_error(X, (w1, f1, p1)) ** 2 np.testing.assert_allclose(1.0 - err / norm_tensor, e1, rtol=1e-5) # Test reproducibility - (w2, f2, p2), e2 = parafac2_nd( - X_ann, rank=3, random_state=3, SECSI_solver=SECSI_solver, **options - ) + (w2, f2, p2), e2 = parafac2_nd(X_ann, rank=3, random_state=3, **options) # Compare to TensorLy wT, fT, pT = parafac2( X, @@ -129,9 +124,8 @@ def test_pf2_r2x(): np.testing.assert_allclose(err, errCMF, rtol=1e-8) -@pytest.mark.parametrize("SECSI_solver", [False, True]) @pytest.mark.parametrize("sparse", [False, True]) -def test_performance(sparse: bool, SECSI_solver: bool): +def test_performance(sparse: bool): """Test for equivalence to TensorLy's PARAFAC2.""" # 5000 by 2000 by 300 is roughly the lupus data pf2shape = [(5_00, 2_00)] * 30 @@ -139,7 +133,7 @@ def test_performance(sparse: bool, SECSI_solver: bool): X = pf2_to_anndata(X, sparse=sparse) - (w1, f1, p1), e1 = parafac2_nd(X, rank=9, SECSI_solver=SECSI_solver) + (w1, f1, p1), e1 = parafac2_nd(X, rank=9) def test_pf2_proj_centering(): diff --git a/parafac2/utils.py b/parafac2/utils.py index 519ecc5..43c8467 100644 --- a/parafac2/utils.py +++ b/parafac2/utils.py @@ -1,9 +1,56 @@ import anndata import cupy as cp import numpy as np +import tensorly as tl from cupyx.scipy import sparse as cupy_sparse from scipy.optimize import linear_sum_assignment from tensorly.cp_tensor import cp_flip_sign, cp_normalize +from tensorly.tenalg.core_tenalg import unfolding_dot_khatri_rao + + +def parafac( + tensor: cp.ndarray, + factors: list[cp.ndarray], + n_iter_max: int = 3, +) -> list[cp.ndarray]: + """Decomposes a tensor into a set of factor matrices using the PARAFAC2 algorithm. + PARAFAC2 decomposes a tensor into a set of factor matrices, where one factor + matrix can vary across slices in one mode. This function implements the core + PARAFAC2 algorithm using alternating least squares. + Args: + tensor (numpy.ndarray): The tensor to decompose. + factors (list of numpy.ndarray): Initial guess for the factor matrices. + The length of the list must be equal to the number of modes in the tensor. + Each element of the list is a numpy.ndarray of shape (size_mode_i, rank), + where size_mode_i is the size of the tensor along mode i, and rank is + the desired rank of the decomposition. + n_iter_max (int, optional): Maximum number of iterations for the algorithm. + Defaults to 3. + Returns: + list of numpy.ndarray: A list containing the factor matrices. + The order of the factor matrices corresponds to the modes of the input + tensor. The first factor matrix is scaled by the weights. + """ + rank = factors[0].shape[1] + + tl.set_backend("cupy") + + for _ in range(n_iter_max): + for mode in range(tensor.ndim): + pinv = cp.ones((rank, rank)) + for i, factor in enumerate(factors): + if i != mode: + pinv *= factor.T @ factor + + mttkrp = unfolding_dot_khatri_rao(tensor, (None, factors), mode) + factors[mode] = cp.linalg.solve(pinv.T, mttkrp.T).T + + weights, factors = cp_normalize((None, factors)) + factors[0] *= weights[None, :] + + tl.set_backend("numpy") + + return factors def anndata_to_list(X_in: anndata.AnnData) -> list[cp.ndarray | cupy_sparse.csr_matrix]: @@ -44,7 +91,7 @@ def project_data( centering = cp.outer(cp.sum(proj, axis=0), means) projected_X[i, :, :] = proj.T @ mat - centering - return projections, cp.asnumpy(projected_X) + return projections, projected_X def reconstruction_error( @@ -63,7 +110,9 @@ def reconstruction_error( B_i = (proj @ B) * A[i] # trace of the multiplication products - norm_sq_err -= 2.0 * np.trace(A[i][:, np.newaxis] * B.T @ projected_X[i] @ C) + norm_sq_err -= 2.0 * np.trace( + A[i][:, np.newaxis] * B.T @ cp.asnumpy(projected_X[i]) @ C + ) norm_sq_err += ((B_i.T @ B_i) * CtC).sum() return norm_sq_err From 5d5629ad21401ea77ddab0cc70f6cb40a2a16234 Mon Sep 17 00:00:00 2001 From: Aaron Meyer Date: Wed, 28 May 2025 08:36:54 -0700 Subject: [PATCH 2/3] Fix deviance --- parafac2/normalize.py | 6 ++++-- parafac2/parafac2.py | 8 +++++++- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/parafac2/normalize.py b/parafac2/normalize.py index eb5f33b..95b432d 100644 --- a/parafac2/normalize.py +++ b/parafac2/normalize.py @@ -2,6 +2,7 @@ import numpy as np from scipy.sparse import csr_array, issparse from scipy.special import xlogy +from sklearn.preprocessing import scale def prepare_dataset( @@ -120,6 +121,7 @@ def get_deviance(data: csr_array) -> np.ndarray: deviance = np.maximum(deviance, 0.0) # Ensure non-negative before sqrt # Calculate signed square root residuals: sign(y - mu) * sqrt(D) - signs = np.sign(y_ij - mu_ij) + residuals = np.sqrt(deviance) * np.sign(y_ij - mu_ij) - return signs * np.sqrt(deviance) + # z-score + return scale(residuals) diff --git a/parafac2/parafac2.py b/parafac2/parafac2.py index 93a32ca..105e449 100644 --- a/parafac2/parafac2.py +++ b/parafac2/parafac2.py @@ -6,6 +6,7 @@ import cupy as cp import numpy as np from scipy.sparse.linalg import norm +from sklearn.utils import resample from sklearn.utils.extmath import randomized_svd from tqdm import tqdm @@ -52,7 +53,12 @@ def parafac2_init( else: norm_tensor = float(norm(X_in.X) ** 2.0 - 2 * np.sum(lmult)) - _, _, C = randomized_svd(X_in.X, rank, random_state=random_state) + if X_in.shape[0] > X_in.shape[1] * 10: + X_svd = resample(X_in.X, n_samples=X_in.shape[1] * 10) + else: + X_svd = X_in.X + + _, _, C = randomized_svd(X_svd, rank, random_state=random_state) # type: ignore factors = [np.ones((n_cond, rank)), np.eye(rank), C.T] return factors, norm_tensor From 45419cddfdaeb9dd36cbe5312274280c8357312d Mon Sep 17 00:00:00 2001 From: Aaron Meyer Date: Fri, 30 May 2025 20:57:28 -0700 Subject: [PATCH 3/3] Adjustments --- parafac2/parafac2.py | 4 +++- pyproject.toml | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/parafac2/parafac2.py b/parafac2/parafac2.py index 105e449..e51d21a 100644 --- a/parafac2/parafac2.py +++ b/parafac2/parafac2.py @@ -54,7 +54,9 @@ def parafac2_init( norm_tensor = float(norm(X_in.X) ** 2.0 - 2 * np.sum(lmult)) if X_in.shape[0] > X_in.shape[1] * 10: - X_svd = resample(X_in.X, n_samples=X_in.shape[1] * 10) + X_svd = resample( + X_in.X, n_samples=X_in.shape[1] * 10, random_state=random_state + ) else: X_svd = X_in.X diff --git a/pyproject.toml b/pyproject.toml index 99f64c1..ee814a9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,7 @@ dependencies = [ managed = true dev-dependencies = [ "pytest>=8.3", - "pytest-cov>=6.0", + "pytest-cov>=6.1", "pyright>=1.1.380", ]