diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1dd49bc..5e9a3ae 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,9 +1,10 @@
 # Changelog
 
-## Version 0.3.0 - 0.3.2
+## Version 0.3.0 - 0.3.3
 
 - Support for string dimensions when creating cellarr arrays.
 - Support query conditions for slice operations.
+- Support sparse writes on dense arrays.
 - Added unique dim values. Only supported for sparse arrays.
 - Fix a minor bug causing memory leaks on large sparse arrays.
 - Fix an issue when domain is max dimension.
diff --git a/src/cellarr_array/core/dense.py b/src/cellarr_array/core/dense.py
index bb6a98d..a9c2f05 100644
--- a/src/cellarr_array/core/dense.py
+++ b/src/cellarr_array/core/dense.py
@@ -6,6 +6,7 @@
 from typing import List, Tuple, Union
 
 import numpy as np
+from scipy import sparse as sp
 
 from .base import CellArray
 from .helpers import SliceHelper
@@ -69,12 +70,16 @@ def _multi_index(self, key: Tuple[Union[slice, List[int]], ...]) -> np.ndarray:
             res = array.multi_index[tuple(tiledb_key)]
             return res[self._attr] if self._attr is not None else res
 
-    def write_batch(self, data: np.ndarray, start_row: int, **kwargs) -> None:
+    def write_batch(self, data: Union[np.ndarray, sp.spmatrix], start_row: int, **kwargs) -> None:
         """Write a batch of data to the dense array.
 
+        This method supports both dense (numpy.ndarray) and sparse
+        (scipy.sparse.spmatrix) inputs.
+
         Args:
             data:
-                Numpy array to write.
+                Numpy array (for dense write) or Scipy sparse matrix
+                (for sparse write) to write.
 
             start_row:
                 Starting row index for writing.
@@ -83,14 +88,11 @@ def write_batch(self, data: np.ndarray, start_row: int, **kwargs) -> None:
                 Additional arguments passed to TileDB write operation.
 
         Raises:
-            TypeError: If input is not a numpy array.
+            TypeError: If input is not a numpy array or sparse matrix.
             ValueError: If dimensions don't match or bounds are exceeded.
         """
-        if not isinstance(data, np.ndarray):
-            raise TypeError("Input must be a numpy array.")
-
-        if len(data.shape) != self.ndim:
-            raise ValueError(f"Data dimensions {data.shape} don't match array dimensions {self.shape}.")
+        if not isinstance(data, np.ndarray) and not sp.issparse(data):
+            raise TypeError("Input must be a numpy array or a scipy sparse matrix.")
 
         end_row = start_row + data.shape[0]
         if end_row > self.shape[0]:
@@ -100,13 +102,31 @@ def write_batch(self, data: np.ndarray, start_row: int, **kwargs) -> None:
 
         if self.ndim == 2 and data.shape[1] != self.shape[1]:
             raise ValueError(f"Data columns {data.shape[1]} don't match array columns {self.shape[1]}.")
+        elif self.ndim == 1 and data.ndim > 1 and data.shape[1] != 1:
+            raise ValueError(f"1D array expects (N, 1) matrix, got {data.shape}")
 
-        if self.ndim == 1:
-            write_region = slice(start_row, end_row)
-        else:  # 2D
-            write_region = (slice(start_row, end_row), slice(0, self.shape[1]))
+        if isinstance(data, np.ndarray):
+            if len(data.shape) != self.ndim:
+                raise ValueError(f"Data dimensions {data.shape} don't match array dimensions {self.shape}.")
 
-        # write_data = {self._attr: data} if len(self.attr_names) > 1 else data
-        with self.open_array(mode="w") as array:
-            print("write_region", write_region)
-            array[write_region] = data
+            if self.ndim == 1:
+                write_region = slice(start_row, end_row)
+            else:
+                write_region = (slice(start_row, end_row), slice(0, self.shape[1]))
+
+            with self.open_array(mode="w") as array:
+                array[write_region] = data
+        else:
+            # Sparse input: only the stored entries are written, one cell at a
+            # time; cells without a stored entry are not touched by this call.
+            coo_data = data.tocoo()
+            with self.open_array(mode="w") as array:
+                if self.ndim == 1:
+                    # Shape was validated as (N, 1) above, so only rows matter.
+                    for r, val in zip(coo_data.row, coo_data.data):
+                        row_idx = r + start_row
+                        array[row_idx : row_idx + 1] = val
+                else:
+                    for r, c, val in zip(coo_data.row, coo_data.col, coo_data.data):
+                        row_idx = r + start_row
+                        array[row_idx : row_idx + 1, c : c + 1] = val
diff --git a/tests/test_dense.py b/tests/test_dense.py
index e9c838a..ba10e4c 100644
--- a/tests/test_dense.py
+++ b/tests/test_dense.py
@@ -2,6 +2,7 @@
 
 import numpy as np
 import pytest
+import scipy.sparse as sp
 import tiledb
 
 from cellarr_array import DenseCellArray, create_cellarray
@@ -65,6 +66,85 @@ def test_2d_bounds_check(sample_dense_array_2d):
         sample_dense_array_2d.write_batch(data, start_row=0)
 
 
+def test_1d_sparse_write_batch(sample_dense_array_1d):
+    data_points = np.array([1.1, 2.2, 3.3], dtype=np.float32)
+    indices = np.array([2, 5, 8])
+    sparse_data = sp.csr_matrix((data_points, (indices, np.zeros(3))), shape=(10, 1))
+
+    sample_dense_array_1d.write_batch(sparse_data, start_row=0)
+    result = sample_dense_array_1d[0:10]
+    expected = sparse_data.toarray().flatten().astype(float)
+    expected[expected == 0] = np.nan
+    np.testing.assert_array_almost_equal(result, expected)
+
+
+def test_1d_sparse_write_with_offset(sample_dense_array_1d):
+    data_points = np.array([1.1, 2.2, 3.3], dtype=np.float32)
+    indices = np.array([2, 5, 8])
+    sparse_data = sp.csr_matrix((data_points, (indices, np.zeros(3, dtype=int))), shape=(10, 1))
+
+    sample_dense_array_1d.write_batch(sparse_data, start_row=20)
+    result = sample_dense_array_1d[20:30]
+    expected = sparse_data.toarray().flatten().astype(float)
+    expected[expected == 0] = np.nan
+    np.testing.assert_array_almost_equal(result, expected)
+
+    result_before = sample_dense_array_1d[0:10]
+    assert np.all(np.isnan(result_before))
+
+
+def test_2d_sparse_write_batch(sample_dense_array_2d):
+    sparse_data = sp.random(10, 50, density=0.1, format="csr", dtype=np.float32)
+
+    sample_dense_array_2d.write_batch(sparse_data, start_row=0)
+    result = sample_dense_array_2d[0:10, :]
+    expected = sparse_data.toarray().astype(float)
+    expected[expected == 0] = np.nan
+    np.testing.assert_array_almost_equal(result, expected)
+
+
+def test_2d_sparse_write_with_offset(sample_dense_array_2d):
+    sparse_data = sp.random(10, 50, density=0.1, format="csr", dtype=np.float32)
+
+    sample_dense_array_2d.write_batch(sparse_data, start_row=20)
+    result = sample_dense_array_2d[20:30, :]
+    expected = sparse_data.toarray().astype(float)
+    expected[expected == 0] = np.nan
+    np.testing.assert_array_almost_equal(result, expected)
+
+    result_before = sample_dense_array_2d[0:10, :]
+    assert np.all(np.isnan(result_before))
+
+
+def test_2d_mixed_dense_sparse_writes(sample_dense_array_2d):
+    dense_data = np.ones((10, 50), dtype=np.float32)
+    sample_dense_array_2d.write_batch(dense_data, start_row=0)
+
+    sparse_data = sp.csr_matrix(([99.0], ([5], [5])), shape=(10, 50), dtype=np.float32)
+
+    sample_dense_array_2d.write_batch(sparse_data, start_row=0)
+    result = sample_dense_array_2d[0:10, :]
+    expected = np.ones((10, 50), dtype=np.float32)
+    expected[5, 5] = 99.0
+
+    np.testing.assert_array_almost_equal(result, expected)
+
+
+def test_2d_sparse_bounds_check(sample_dense_array_2d):
+    sparse_data_rows = sp.random(150, 50, format="csr", dtype=np.float32)
+    with pytest.raises(ValueError, match="would exceed array bounds"):
+        sample_dense_array_2d.write_batch(sparse_data_rows, start_row=0)
+
+    sparse_data_cols = sp.random(10, 60, format="csr", dtype=np.float32)
+    with pytest.raises(ValueError, match="Data columns"):
+        sample_dense_array_2d.write_batch(sparse_data_cols, start_row=0)
+
+
+def test_write_batch_rejects_unsupported_input(sample_dense_array_2d):
+    with pytest.raises(TypeError, match="numpy array or a scipy sparse matrix"):
+        sample_dense_array_2d.write_batch([[1.0] * 50] * 10, start_row=0)
+
+
 def test_1d_slicing(sample_dense_array_1d):
     data = np.random.random(100).astype(np.float32)
     sample_dense_array_1d.write_batch(data, start_row=0)