Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
# Changelog

## Version 0.3.0 - 0.3.2
## Version 0.3.0 - 0.3.3

- Support for string dimensions when creating cellarr arrays.
- Support query conditions for slice operations.
- Support sparse writes on dense arrays.
- Added unique dim values. Only supported for sparse arrays.
- Fix a minor bug causing memory leaks on large sparse arrays.
- Fix an issue when the domain equals the maximum dimension extent.
Expand Down
61 changes: 45 additions & 16 deletions src/cellarr_array/core/dense.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from typing import List, Tuple, Union

import numpy as np
from scipy import sparse as sp

from .base import CellArray
from .helpers import SliceHelper
Expand Down Expand Up @@ -69,12 +70,16 @@ def _multi_index(self, key: Tuple[Union[slice, List[int]], ...]) -> np.ndarray:
res = array.multi_index[tuple(tiledb_key)]
return res[self._attr] if self._attr is not None else res

def write_batch(self, data: np.ndarray, start_row: int, **kwargs) -> None:
def write_batch(self, data: Union[np.ndarray, sp.spmatrix], start_row: int, **kwargs) -> None:
"""Write a batch of data to the dense array.

This method supports both dense (numpy.ndarray) and sparse
(scipy.sparse.spmatrix) inputs.

Args:
data:
Numpy array to write.
Numpy array (for dense write) or Scipy sparse matrix
(for sparse write) to write.

start_row:
Starting row index for writing.
Expand All @@ -83,14 +88,9 @@ def write_batch(self, data: np.ndarray, start_row: int, **kwargs) -> None:
Additional arguments passed to TileDB write operation.

Raises:
TypeError: If input is not a numpy array.
TypeError: If input is not a numpy array or sparse matrix.
ValueError: If dimensions don't match or bounds are exceeded.
"""
if not isinstance(data, np.ndarray):
raise TypeError("Input must be a numpy array.")

if len(data.shape) != self.ndim:
raise ValueError(f"Data dimensions {data.shape} don't match array dimensions {self.shape}.")

end_row = start_row + data.shape[0]
if end_row > self.shape[0]:
Expand All @@ -100,13 +100,42 @@ def write_batch(self, data: np.ndarray, start_row: int, **kwargs) -> None:

if self.ndim == 2 and data.shape[1] != self.shape[1]:
raise ValueError(f"Data columns {data.shape[1]} don't match array columns {self.shape[1]}.")
elif self.ndim == 1 and data.ndim > 1 and data.shape[1] != 1:
raise ValueError(f"1D array expects (N, 1) matrix, got {data.shape}")

if self.ndim == 1:
write_region = slice(start_row, end_row)
else: # 2D
write_region = (slice(start_row, end_row), slice(0, self.shape[1]))
if isinstance(data, np.ndarray):
if len(data.shape) != self.ndim:
raise ValueError(f"Data dimensions {data.shape} don't match array dimensions {self.shape}.")

# write_data = {self._attr: data} if len(self.attr_names) > 1 else data
with self.open_array(mode="w") as array:
print("write_region", write_region)
array[write_region] = data
if self.ndim == 1:
write_region = slice(start_row, end_row)
else:
write_region = (slice(start_row, end_row), slice(0, self.shape[1]))

with self.open_array(mode="w") as array:
array[write_region] = data

elif sp.issparse(data):
    # Normalize any sparse format to COO so we can iterate (row, col, value) triplets.
    coo_data = data.tocoo() if not isinstance(data, sp.coo_matrix) else data
    is_1d = self.ndim == 1
    if is_1d:
        if coo_data.shape[0] == 1:  # Convert (1,N) to (N,1)
            coo_data = sp.coo_matrix(
                (coo_data.data, (coo_data.col, np.zeros_like(coo_data.col))), shape=(coo_data.shape[1], 1)
            )
        elif coo_data.shape[1] != 1:
            raise ValueError(f"1D array expects (N, 1) matrix, got {coo_data.shape}")

    with self.open_array(mode="w") as array:
        if is_1d:
            # NOTE(review): this branch ignores `start_row` — `array[r : r + 1]`
            # writes at the matrix-local row, not `r + start_row` (the offset form
            # is the commented-out line below, and the 2D branch does apply it).
            # Looks like a bug whenever start_row != 0 — confirm and fix.
            for r, val in zip(coo_data.row, coo_data.data):
                # row_idx = r + start_row
                array[r : r + 1] = val
        else:
            # One TileDB write per nonzero cell, shifted into the target region.
            # NOTE(review): per-nonzero 1x1 writes may be slow for large inputs;
            # consider batching into a single multi-cell write — verify feasibility.
            for r, c, val in zip(coo_data.row, coo_data.col, coo_data.data):
                row_idx = r + start_row
                col_idx = c
                array[row_idx : row_idx + 1, col_idx : col_idx + 1] = val

else:
    raise TypeError("Input must be a numpy array or a scipy sparse matrix.")
60 changes: 60 additions & 0 deletions tests/test_dense.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import numpy as np
import pytest
import scipy.sparse as sp
import tiledb

from cellarr_array import DenseCellArray, create_cellarray
Expand Down Expand Up @@ -65,6 +66,65 @@ def test_2d_bounds_check(sample_dense_array_2d):
sample_dense_array_2d.write_batch(data, start_row=0)


def test_1d_sparse_write_batch(sample_dense_array_1d):
    """A sparse (N, 1) matrix round-trips through a 1D dense array.

    Cells not present in the sparse input read back as the fill value (NaN).
    """
    values = np.array([1.1, 2.2, 3.3], dtype=np.float32)
    rows = np.array([2, 5, 8])
    # Sparse index arrays must be integer dtype; a bare np.zeros(3) is float64
    # and only works through implicit (and deprecated) coercion in scipy.
    sparse_data = sp.csr_matrix((values, (rows, np.zeros(3, dtype=np.int64))), shape=(10, 1))

    sample_dense_array_1d.write_batch(sparse_data, start_row=0)

    result = sample_dense_array_1d[0:10]
    expected = sparse_data.toarray().flatten().astype(float)
    # Unwritten cells come back as NaN, not 0.
    expected[expected == 0] = np.nan
    np.testing.assert_array_almost_equal(result, expected)


def test_2d_sparse_write_batch(sample_dense_array_2d):
    """A random sparse matrix round-trips through a 2D dense array."""
    payload = sp.random(10, 50, density=0.1, format="csr", dtype=np.float32)

    sample_dense_array_2d.write_batch(payload, start_row=0)

    dense_expected = payload.toarray().astype(float)
    # Cells absent from the sparse input read back as NaN, not 0.
    dense_expected[dense_expected == 0] = np.nan
    np.testing.assert_array_almost_equal(sample_dense_array_2d[0:10, :], dense_expected)


def test_2d_sparse_write_with_offset(sample_dense_array_2d):
    """Sparse writes honor a non-zero start_row and leave earlier rows untouched."""
    payload = sp.random(10, 50, density=0.1, format="csr", dtype=np.float32)

    sample_dense_array_2d.write_batch(payload, start_row=20)

    dense_expected = payload.toarray().astype(float)
    dense_expected[dense_expected == 0] = np.nan
    np.testing.assert_array_almost_equal(sample_dense_array_2d[20:30, :], dense_expected)

    # Rows before the offset must still hold only fill values (NaN).
    untouched = sample_dense_array_2d[0:10, :]
    assert np.isnan(untouched).all()


def test_2d_mixed_dense_sparse_writes(sample_dense_array_2d):
    """A sparse write over dense data overwrites only its nonzero cells."""
    base = np.ones((10, 50), dtype=np.float32)
    sample_dense_array_2d.write_batch(base, start_row=0)

    # Single nonzero at (5, 5); every other cell must keep its dense value.
    overlay = sp.csr_matrix(([99.0], ([5], [5])), shape=(10, 50), dtype=np.float32)
    sample_dense_array_2d.write_batch(overlay, start_row=0)

    expected = np.ones((10, 50), dtype=np.float32)
    expected[5, 5] = 99.0
    np.testing.assert_array_almost_equal(sample_dense_array_2d[0:10, :], expected)


def test_2d_sparse_bounds_check(sample_dense_array_2d):
    """Sparse writes that overflow either dimension raise ValueError."""
    too_many_rows = sp.random(150, 50, format="csr", dtype=np.float32)
    with pytest.raises(ValueError, match="would exceed array bounds"):
        sample_dense_array_2d.write_batch(too_many_rows, start_row=0)

    too_many_cols = sp.random(10, 60, format="csr", dtype=np.float32)
    with pytest.raises(ValueError, match="Data columns"):
        sample_dense_array_2d.write_batch(too_many_cols, start_row=0)


def test_1d_slicing(sample_dense_array_1d):
data = np.random.random(100).astype(np.float32)
sample_dense_array_1d.write_batch(data, start_row=0)
Expand Down