From 95722c398694e5e088b8bf73341621149360be7e Mon Sep 17 00:00:00 2001 From: Agisilaos Kounelis Date: Mon, 9 Feb 2026 15:04:53 +0200 Subject: [PATCH] Fix var-length write for equal-length sub-arrays --- tiledb/array.py | 16 ++++++- tiledb/sparse_array.py | 8 +++- tiledb/tests/test_libtiledb.py | 80 ++++++++++++++++++++++++++++++++++ 3 files changed, 101 insertions(+), 3 deletions(-) diff --git a/tiledb/array.py b/tiledb/array.py index cd9b3c72b4..b00c46fca4 100644 --- a/tiledb/array.py +++ b/tiledb/array.py @@ -869,7 +869,21 @@ def _write_array( attr_val = np.nan_to_num(values[i]) else: attr_val = values[i] - buffer, offsets = array_to_buffer(attr_val, True, False) + + # Numpy coalesces equal-length sub-arrays into N-D + if ( + attr_val.ndim > 1 + and attr_val.size > 0 + and not isinstance(attr_val.flat[0], (np.ndarray, str, bytes)) + ): + attr_val = np.ascontiguousarray(attr_val, dtype=attr.dtype) + n = attr_val.shape[0] + offsets = np.arange(n, dtype=np.uint64) * np.uint64( + attr_val.strides[0] + ) + buffer = attr_val.ravel().view(np.uint8) + else: + buffer, offsets = array_to_buffer(attr_val, True, False) except Exception as exc: raise type(exc)( f"Failed to convert buffer for attribute: '{attr.name}'" diff --git a/tiledb/sparse_array.py b/tiledb/sparse_array.py index 70be2c86a8..6dab2c0140 100644 --- a/tiledb/sparse_array.py +++ b/tiledb/sparse_array.py @@ -174,10 +174,14 @@ def _setitem_impl_sparse(self, selection, val, nullmaps: dict): ) ncells = sparse_coords[0].shape[0] - if attr_val.size != ncells: + # For var-length attributes, numpy may coalesce equal-length + # sub-arrays into a higher-dimensional array (e.g. shape (n, m) + # instead of (n,) with dtype=object). Use shape[0] in that case. + nvals = attr_val.shape[0] if attr.isvar and attr_val.ndim > 1 else attr_val.size + if nvals != ncells: raise ValueError( "value length ({}) does not match " - "coordinate length ({})".format(attr_val.size, ncells) + "coordinate length ({})".format(nvals, ncells) ) sparse_attributes.append(attr._internal_name) sparse_values.append(attr_val) diff --git a/tiledb/tests/test_libtiledb.py b/tiledb/tests/test_libtiledb.py index 07d852ba5b..7594fd72ee 100644 --- a/tiledb/tests/test_libtiledb.py +++ b/tiledb/tests/test_libtiledb.py @@ -1419,6 +1419,39 @@ def test_varlen_write_floats(self): # can't use assert_array_equal w/ object array self.assertTrue(all(np.array_equal(x, A[i]) for i, x in enumerate(T_))) + def test_varlen_write_homogeneous_subarrays(self): + """Test writing var-length attributes where all sub-arrays have the + same length. numpy coalesces these into a 2D array which previously + caused errors. See https://github.com/TileDB-Inc/TileDB-Py/issues/494 + """ + # All sub-arrays have length 3 — numpy will coalesce into shape (4, 3) + A = np.array( + [ + np.array([1, 2, 9], dtype=np.int64), + np.array([3, 4, 5], dtype=np.int64), + np.array([7, 8, 6], dtype=np.int64), + np.array([10, 11, 12], dtype=np.int64), + ], + dtype="O", + ) + + dom = tiledb.Domain(tiledb.Dim(domain=(1, 4), tile=4)) + att = tiledb.Attr(name="val", dtype=np.int64, var=True) + schema = tiledb.ArraySchema(dom, (att,)) + tiledb.DenseArray.create(self.path("homogeneous_varlen"), schema) + + with tiledb.DenseArray(self.path("homogeneous_varlen"), mode="w") as T: + T[:] = {"val": A} + + with tiledb.DenseArray(self.path("homogeneous_varlen"), mode="r") as T: + res = T[:]["val"] + expected = np.empty(4, dtype=object) + expected[0] = np.array([1, 2, 9], dtype=np.int64) + expected[1] = np.array([3, 4, 5], dtype=np.int64) + expected[2] = np.array([7, 8, 6], dtype=np.int64) + expected[3] = np.array([10, 11, 12], dtype=np.int64) + assert_subarrays_equal(res, expected) + def test_varlen_write_floats_2d(self): A = np.array( [np.random.rand(x) for x in np.arange(1, 10)], dtype=object @@ -2249,6 +2282,53 @@ def test_sparse_2d_varlen_int(self, fx_sparse_cell_order): assert_unordered_equal(res["__dim_0"], c1) assert_unordered_equal(res["__dim_1"], c2) + @pytest.mark.parametrize( + "dtype,use_object_dtype", + [ + (np.int64, True), + (np.int64, False), + (np.int32, True), + (np.float32, True), + (np.float64, False), + (np.uint32, True), + ], + ) + def test_sparse_varlen_homogeneous_subarrays( + self, fx_sparse_cell_order, dtype, use_object_dtype + ): + """Test writing var-length attributes where all sub-arrays have the + same length. numpy coalesces these into a 2D array which previously + caused a 'value length does not match coordinate length' error. + See https://github.com/TileDB-Inc/TileDB-Py/issues/494 + """ + path = self.path("test_sparse_varlen_homogeneous_subarrays") + dom = tiledb.Domain(tiledb.Dim(domain=(0, 10), dtype=np.int64)) + att = tiledb.Attr(name="val", var=True, dtype=dtype) + schema = tiledb.ArraySchema( + dom, (att,), sparse=True, cell_order=fx_sparse_cell_order + ) + tiledb.SparseArray.create(path, schema) + + a = np.array([1, 2, 9], dtype=dtype) + b = np.array([3, 4, 5], dtype=dtype) + + if use_object_dtype: + # User explicitly passes dtype='O'; becomes 2D after dtype conversion + vals = np.array([a, b], dtype="O") + else: + # User has no control over dtype; numpy coalesces to 2D native + vals = np.array([a, b]) + + with tiledb.SparseArray(path, "w") as A: + A[[1, 2]] = {"val": vals} + + with tiledb.SparseArray(path, "r") as A: + res = A[:] + expected = np.empty(2, dtype=object) + expected[0] = a + expected[1] = b + assert_subarrays_equal(res["val"], expected) + def test_sparse_mixed_domain_uint_float64(self, fx_sparse_cell_order): path = self.path("mixed_domain_uint_float64") dims = [