Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 15 additions & 1 deletion tiledb/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -869,7 +869,21 @@ def _write_array(
attr_val = np.nan_to_num(values[i])
else:
attr_val = values[i]
buffer, offsets = array_to_buffer(attr_val, True, False)

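# NumPy coalesces equal-length sub-arrays into a single N-D array, so treat
# each entry along axis 0 as one cell and build the buffer/offsets directly.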
if (
attr_val.ndim > 1
and attr_val.size > 0
and not isinstance(attr_val.flat[0], (np.ndarray, str, bytes))
):
attr_val = np.ascontiguousarray(attr_val, dtype=attr.dtype)
n = attr_val.shape[0]
offsets = np.arange(n, dtype=np.uint64) * np.uint64(
attr_val.strides[0]
)
buffer = attr_val.ravel().view(np.uint8)
else:
buffer, offsets = array_to_buffer(attr_val, True, False)
except Exception as exc:
raise type(exc)(
f"Failed to convert buffer for attribute: '{attr.name}'"
Expand Down
8 changes: 6 additions & 2 deletions tiledb/sparse_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,10 +174,14 @@ def _setitem_impl_sparse(self, selection, val, nullmaps: dict):
)

ncells = sparse_coords[0].shape[0]
if attr_val.size != ncells:
# For var-length attributes, numpy may coalesce equal-length
# sub-arrays into a higher-dimensional array (e.g. shape (n, m)
# instead of (n,) with dtype=object). Use shape[0] in that case.
nvals = attr_val.shape[0] if attr.isvar and attr_val.ndim > 1 else attr_val.size
if nvals != ncells:
raise ValueError(
"value length ({}) does not match "
"coordinate length ({})".format(attr_val.size, ncells)
"coordinate length ({})".format(nvals, ncells)
)
sparse_attributes.append(attr._internal_name)
sparse_values.append(attr_val)
Expand Down
80 changes: 80 additions & 0 deletions tiledb/tests/test_libtiledb.py
Original file line number Diff line number Diff line change
Expand Up @@ -1419,6 +1419,39 @@ def test_varlen_write_floats(self):
# can't use assert_array_equal w/ object array
self.assertTrue(all(np.array_equal(x, A[i]) for i, x in enumerate(T_)))

def test_varlen_write_homogeneous_subarrays(self):
    """Round-trip a dense var-length attribute whose cells share one length.

    Every sub-array below holds three values, so numpy folds the input into
    a single array of shape (4, 3) instead of four separate cells; writing
    such input previously caused errors.
    See https://github.com/TileDB-Inc/TileDB-Py/issues/494
    """
    uri = self.path("homogeneous_varlen")

    # Equal-length cells: the object-dtype request below still yields (4, 3).
    cells = [
        np.array(row, dtype=np.int64)
        for row in ([1, 2, 9], [3, 4, 5], [7, 8, 6], [10, 11, 12])
    ]
    A = np.array(cells, dtype="O")

    # Reference result: a 1-D object array holding one sub-array per cell.
    expected = np.empty(4, dtype=object)
    for idx, cell in enumerate(cells):
        expected[idx] = cell

    schema = tiledb.ArraySchema(
        tiledb.Domain(tiledb.Dim(domain=(1, 4), tile=4)),
        (tiledb.Attr(name="val", dtype=np.int64, var=True),),
    )
    tiledb.DenseArray.create(uri, schema)

    with tiledb.DenseArray(uri, mode="w") as T:
        T[:] = {"val": A}

    with tiledb.DenseArray(uri, mode="r") as T:
        res = T[:]["val"]
        assert_subarrays_equal(res, expected)

def test_varlen_write_floats_2d(self):
A = np.array(
[np.random.rand(x) for x in np.arange(1, 10)], dtype=object
Expand Down Expand Up @@ -2249,6 +2282,53 @@ def test_sparse_2d_varlen_int(self, fx_sparse_cell_order):
assert_unordered_equal(res["__dim_0"], c1)
assert_unordered_equal(res["__dim_1"], c2)

@pytest.mark.parametrize(
    "dtype,use_object_dtype",
    [
        (np.int64, True),
        (np.int64, False),
        (np.int32, True),
        (np.float32, True),
        (np.float64, False),
        (np.uint32, True),
    ],
)
def test_sparse_varlen_homogeneous_subarrays(
    self, fx_sparse_cell_order, dtype, use_object_dtype
):
    """Round-trip a sparse var-length attribute whose cells share one length.

    numpy coalesces equal-length sub-arrays into a 2D array, which
    previously triggered a 'value length does not match coordinate length'
    error on write. Both an explicit object-dtype input and a plain native
    2D input are exercised.
    See https://github.com/TileDB-Inc/TileDB-Py/issues/494
    """
    path = self.path("test_sparse_varlen_homogeneous_subarrays")

    schema = tiledb.ArraySchema(
        tiledb.Domain(tiledb.Dim(domain=(0, 10), dtype=np.int64)),
        (tiledb.Attr(name="val", var=True, dtype=dtype),),
        sparse=True,
        cell_order=fx_sparse_cell_order,
    )
    tiledb.SparseArray.create(path, schema)

    a = np.array([1, 2, 9], dtype=dtype)
    b = np.array([3, 4, 5], dtype=dtype)

    # With dtype="O" the caller asks for an object array explicitly; without
    # it numpy picks the native dtype. Either way the sub-arrays are
    # coalesced into two dimensions.
    vals = np.array([a, b], dtype="O") if use_object_dtype else np.array([a, b])

    # Reference result: a 1-D object array with one sub-array per coordinate.
    expected = np.empty(2, dtype=object)
    for idx, cell in enumerate((a, b)):
        expected[idx] = cell

    with tiledb.SparseArray(path, "w") as A:
        A[[1, 2]] = {"val": vals}

    with tiledb.SparseArray(path, "r") as A:
        res = A[:]
        assert_subarrays_equal(res["val"], expected)

def test_sparse_mixed_domain_uint_float64(self, fx_sparse_cell_order):
path = self.path("mixed_domain_uint_float64")
dims = [
Expand Down
Loading