7 changes: 7 additions & 0 deletions .github/workflows/ci.yml
@@ -156,6 +156,13 @@ jobs:
           cd $RUNNER_TEMP
           pytest -vv --showlocals $PROJECT_CWD

+      - name: "Re-run tests with pandas 2"
+        run: |
+          PROJECT_CWD=$PWD
+          pip install "pandas>=2,<3"
+          cd $RUNNER_TEMP
+          pytest -vv --showlocals $PROJECT_CWD
+
       - name: "Re-run tests without pandas"
         run: |
           PROJECT_CWD=$PWD
13 changes: 3 additions & 10 deletions examples/incomplete_iteration.py
@@ -40,21 +40,14 @@


 def check_dataframe_deps():
-    pd_error = """Pandas version >= 1.0 and < 3.0 required for dataframe functionality.
-    Please `pip install pandas>=1.0,<3.0` to proceed."""
+    pd_error = """Pandas is required for dataframe functionality.
+    Please `pip install pandas` to proceed."""

     try:
-        import pandas as pd
+        import pandas
     except ImportError:
         raise Exception(pd_error)

-    from packaging.version import Version
-
-    if Version(pd.__version__) < Version("1.0") or Version(pd.__version__) >= Version(
-        "3.0.0.dev0"
-    ):
-        raise Exception(pd_error)
-

 # Name of the array to create.
 array_name = "incomplete_iteration"
13 changes: 3 additions & 10 deletions examples/parallel_csv_ingestion.py
@@ -49,21 +49,14 @@


 def check_dataframe_deps():
-    pd_error = """Pandas version >= 1.0 and < 3.0 required for dataframe functionality.
-    Please `pip install pandas>=1.0,<3.0` to proceed."""
+    pd_error = """Pandas is required for dataframe functionality.
+    Please `pip install pandas` to proceed."""

     try:
-        import pandas as pd
+        import pandas
     except ImportError:
         raise Exception(pd_error)

-    from packaging.version import Version
-
-    if Version(pd.__version__) < Version("1.0") or Version(pd.__version__) >= Version(
-        "3.0.0.dev0"
-    ):
-        raise Exception(pd_error)
-

 def generate_csvs(csv_folder, count=9, min_length=1, max_length=109):
     def make_dataframe(col_size):
4 changes: 2 additions & 2 deletions pyproject.toml
@@ -57,7 +57,7 @@ test = [
     "hypothesis",
     "psutil",
     "pyarrow",
-    "pandas<3",
+    "pandas",
     "dask[distributed]",
 ]
@@ -118,6 +118,6 @@ test-requires = [
     "hypothesis",
     "psutil",
     "pyarrow",
-    "pandas<3",
+    "pandas",
 ]
 test-command = "pytest {project}"
53 changes: 28 additions & 25 deletions tiledb/dataframe_.py
@@ -15,28 +15,18 @@


 def check_dataframe_deps():
-    pd_error = """Pandas version >= 1.0 and < 3.0 required for dataframe functionality.
-    Please `pip install pandas>=1.0,<3.0` to proceed."""
-    pa_error = """PyArrow version >= 1.0 is suggested for dataframe functionality.
-    Please `pip install pyarrow>=1.0`."""
+    pd_error = """Pandas is required for dataframe functionality.
+    Please `pip install pandas` to proceed."""
+    pa_error = """PyArrow is suggested for dataframe functionality.
+    Please `pip install pyarrow`."""

     try:
-        import pandas as pd
+        import pandas
     except ImportError:
         raise Exception(pd_error)

-    from packaging.version import Version
-
-    if Version(pd.__version__) < Version("1.0") or Version(pd.__version__) >= Version(
-        "3.0.0.dev0"
-    ):
-        raise Exception(pd_error)
-
     try:
-        import pyarrow as pa
-
-        if Version(pa.__version__) < Version("1.0"):
-            warnings.warn(pa_error)
+        import pyarrow
     except ImportError:
         warnings.warn(pa_error)

@@ -154,7 +144,7 @@ class ColumnInfo:

     @classmethod
     def from_values(cls, array_like, varlen_types=()):
-        from pandas import CategoricalDtype
+        from pandas import CategoricalDtype, StringDtype
         from pandas.api import types as pd_types

         if pd_types.is_object_dtype(array_like):
@@ -171,6 +161,16 @@ def from_values(cls, array_like, varlen_types=()):
             raise NotImplementedError(
                 f"{inferred_dtype} inferred dtype not supported (column {array_like.name})"
             )
+        elif hasattr(array_like, "dtype") and isinstance(array_like.dtype, StringDtype):
+            # Explicit pd.StringDtype() (name="string") is always nullable;
+            # auto-inferred str (name="str") depends on data
+            explicit = array_like.dtype.name == "string"
+            return cls(
+                np.dtype(np.str_),
+                repr="string" if explicit else None,
+                var=True,
+                nullable=explicit or bool(array_like.isna().any()),
+            )
         elif hasattr(array_like, "dtype") and isinstance(
             array_like.dtype, CategoricalDtype
         ):
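Note: the name-based branch above is what separates pandas 3's auto-inferred `str` dtype from the explicit `string` dtype. A minimal sketch of the distinction, assuming pandas >= 3 (on pandas 2 plain Python strings still infer as object, so only the explicit case reaches this branch):

    import pandas as pd

    inferred = pd.Series(["a", "b"])                   # pandas 3 infers StringDtype, name == "str"
    explicit = pd.Series(["a", "b"], dtype="string")   # explicit StringDtype, name == "string"

    print(inferred.dtype.name)                         # "str"    -> NaN-backed, nullable only if data has NAs
    print(explicit.dtype.name)                         # "string" -> pd.NA-backed, always nullable
    print(isinstance(inferred.dtype, pd.StringDtype))  # True on pandas 3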
@@ -211,6 +211,14 @@ def from_dtype(cls, dtype, column_name, varlen_types=()):
         dtype = pd_types.pandas_dtype(dtype)
         # Note: be careful if you rearrange the order of the following checks

+        # pandas StringDtype (auto-inferred 'str' and explicit 'string')
+        from pandas import StringDtype
+
+        if isinstance(dtype, StringDtype):
+            repr_val = "string" if dtype.name == "string" else None
+            nullable = dtype.name == "string"
+            return cls(np.dtype(np.str_), repr=repr_val, var=True, nullable=nullable)
+
         # extension types
         if pd_types.is_extension_array_dtype(dtype):
             if libtiledb_version() < (2, 10) and pd_types.is_bool_dtype(dtype):
@@ -255,12 +263,7 @@ def from_dtype(cls, dtype, column_name, varlen_types=()):

         # datetime types
         if pd_types.is_datetime64_any_dtype(dtype):
-            if dtype == "datetime64[ns]":
-                return cls(dtype)
-            else:
-                raise NotImplementedError(
-                    f"Only 'datetime64[ns]' datetime dtype is supported (column {column_name})"
-                )
+            return cls(dtype)

         # string types
         # don't use pd_types.is_string_dtype() because it includes object types too
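Note: dropping the ns-only guard matches pandas 2+, where a Series can carry non-nanosecond datetime64 units directly. A quick sketch of a dtype that now flows through (assumes pandas >= 2; pandas 1.x coerced such arrays to datetime64[ns]):

    import numpy as np
    import pandas as pd

    s = pd.Series(np.array(["2024-01-01", "2024-01-02"], dtype="datetime64[ms]"))
    print(s.dtype)                                  # datetime64[ms], unit preserved on pandas >= 2
    print(pd.api.types.is_datetime64_any_dtype(s))  # True, so from_dtype now accepts it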
@@ -517,8 +520,8 @@ def _df_to_np_arrays(df, column_infos, fillna):
         if not column_info.var:
             to_numpy_kwargs.update(dtype=column_info.dtype)

-        if column_info.nullable:
-            # use default 0/empty for the dtype
+        if column_info.nullable and column.isna().any():
+            # Only create nullmap if data actually has nulls
             to_numpy_kwargs.update(na_value=column_info.dtype.type())
             nullmaps[name] = (~column.isna()).to_numpy(dtype=np.uint8)
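Note: with the tightened guard, an all-valid nullable column skips both the `na_value` fill and the validity bitmap. A minimal sketch of the two paths, using Int64 as a stand-in for any nullable extension dtype:

    import numpy as np
    import pandas as pd

    clean = pd.Series([1, 2, 3], dtype="Int64")
    print(clean.isna().any())                          # False -> no nullmap is built

    holey = pd.Series([1, pd.NA], dtype="Int64")
    print(holey.to_numpy(dtype=np.int64, na_value=0))  # [1 0], NA filled with the dtype default
    print((~holey.isna()).to_numpy(dtype=np.uint8))    # [1 0], the validity mask handed to TileDB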
8 changes: 8 additions & 0 deletions tiledb/dense_array.py
@@ -481,6 +481,14 @@ def _setitem_impl(self, selection, val, nullmaps: dict):

         try:
             if attr.isvar:
+                # Capture null mask before np.asarray() loses pandas NA info
+                if (
+                    attr.isnullable
+                    and name not in nullmaps
+                    and hasattr(attr_val, "isna")
+                ):
+                    nullmaps[name] = (~attr_val.isna()).to_numpy(dtype=np.uint8)
+
                 # ensure that the value is array-convertible, for example: pandas.Series
                 attr_val = np.asarray(attr_val)
                 if attr.isnullable and name not in nullmaps:
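Note on the early capture: once a nullable pandas column passes through np.asarray(), the typed NA mask is gone; missing values become ordinary elements of a plain ndarray. A small sketch of why the mask is taken first, using a nullable string column:

    import numpy as np
    import pandas as pd

    s = pd.Series(["x", pd.NA, "z"], dtype="string")
    mask = (~s.isna()).to_numpy(dtype=np.uint8)  # [1 0 1], captured while the NA info is typed
    arr = np.asarray(s)                          # object ndarray; the NA is now just another element
    print(mask, arr)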
11 changes: 10 additions & 1 deletion tiledb/multirange_indexing.py
@@ -890,7 +890,16 @@ def _update_df_from_meta(
             col_dtypes[name] = dtype

         if col_dtypes:
-            df = df.astype(col_dtypes, copy=False)
+            # '<U0' is stored in __pandas_index_dims metadata for var-length string
+            # dimensions (str(np.dtype(np.str_)) == '<U0'). Applying astype('<U0')
+            # was a no-op on pandas 2 but on pandas 3 it forces StringDtype back to
+            # object, breaking the roundtrip. The string data already has the correct
+            # dtype from pandas' own inference, so we skip it here.
+            col_dtypes = {
+                name: dtype for name, dtype in col_dtypes.items() if dtype != "<U0"
+            }
+            if col_dtypes:
+                df = df.astype(col_dtypes)

         if index_col:
             if index_col is not True:
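Note: '<U0' is the string form of numpy's zero-length flexible string dtype, which is why the filter can be a simple comparison. A quick check of the values involved (the pandas 3 coercion described in the comment is the behavior being worked around):

    import numpy as np
    import pandas as pd

    print(str(np.dtype(np.str_)))  # '<U0', the value stored in __pandas_index_dims
    s = pd.Series(["a", "bb"])     # pandas 3: StringDtype; pandas 2: object
    # s.astype("<U0") is the call being skipped: a no-op on pandas 2, but on
    # pandas 3 it would coerce the StringDtype column back to object.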
9 changes: 9 additions & 0 deletions tiledb/sparse_array.py
@@ -122,6 +122,15 @@ def _setitem_impl_sparse(self, selection, val, nullmaps: dict):
             attr_val = val[name]

         try:
+            # Capture null mask before np.asarray() loses pandas NA info
+            if (
+                attr.isvar
+                and attr.isnullable
+                and name not in nullmaps
+                and hasattr(attr_val, "isna")
+            ):
+                nullmaps[name] = (~attr_val.isna()).to_numpy(dtype=np.uint8)
+
             # ensure that the value is array-convertible, for example: pandas.Series
             attr_val = np.asarray(attr_val)

11 changes: 3 additions & 8 deletions tiledb/tests/common.py
@@ -26,17 +26,12 @@

 def has_pandas():
     try:
-        import pandas as pd
-    except ImportError:
-        return False
+        import pandas

-    if Version(pd.__version__) < Version("1.0") or Version(pd.__version__) >= Version(
-        "3.0.0.dev0"
-    ):
+        return True
+    except ImportError:
         return False

-    return True


 def has_pyarrow():
     try:
8 changes: 7 additions & 1 deletion tiledb/tests/datatypes.py
@@ -48,11 +48,17 @@ def __len__(self):
         return len(self._flat_arrays)

     def __getitem__(self, i):
-        return self._flat_arrays[i]
+        if isinstance(i, (int, np.integer)):
+            return self._flat_arrays[i]
+        return type(self)(self._flat_arrays[i], self._dtype)

     @property
     def dtype(self):
         return self._dtype

+    def copy(self):
+        return type(self)(self._flat_arrays, self._dtype)
+
     @property
     def ndim(self):
         return 1
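Note: the `__getitem__` change brings the test extension array in line with the pandas ExtensionArray indexing contract: a scalar position returns an element, while slices and masks return a new array of the same type (`copy()` belongs to the same interface). For reference, a built-in extension array shows the expected behavior:

    import numpy as np
    import pandas as pd

    arr = pd.array([1, 2, 3], dtype="Int64")
    print(arr[0])                               # scalar element: 1
    print(arr[1:])                              # IntegerArray([2, 3]), same type rather than a list
    print(arr[np.array([True, False, True])])   # boolean mask -> IntegerArray([1, 3])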
4 changes: 2 additions & 2 deletions tiledb/tests/test_enumeration.py
@@ -111,7 +111,7 @@ def test_array_schema_enumeration(self):

     @pytest.mark.skipif(
         not has_pyarrow() or not has_pandas(),
-        reason="pyarrow>=1.0 and/or pandas>=1.0,<3.0 not installed",
+        reason="pyarrow>=1.0 and/or pandas not installed",
     )
     @pytest.mark.parametrize("sparse", [True, False])
     @pytest.mark.parametrize("pass_df", [True, False])
@@ -185,7 +185,7 @@ def test_enum_dtypes(self, dtype, values):
         assert enmr.dtype == enmr.values().dtype == dtype
         assert_array_equal(enmr.values(), values)

-    @pytest.mark.skipif(not has_pandas(), reason="pandas>=1.0,<3.0 not installed")
+    @pytest.mark.skipif(not has_pandas(), reason="pandas not installed")
     def test_from_pandas_dtype_mismatch(self):
         import pandas as pd

7 changes: 3 additions & 4 deletions tiledb/tests/test_examples.py
@@ -43,7 +43,7 @@ def test_examples(self, path):
             ]
         ]
         if not has_pandas() and path in requires_pd:
-            pytest.mark.skip("pandas>=1.0,<3.0 not installed")
+            pytest.mark.skip("pandas not installed")
         else:
             with tempfile.TemporaryDirectory() as tmpdir:
                 try:
@@ -73,10 +73,9 @@ def test_docs(self, capsys):
         if failures:
             stderr = capsys.readouterr().out
             if "No module named 'pandas'" in stderr or (
-                "Pandas version >= 1.0 and < 3.0 required for dataframe functionality"
-                in stderr
+                "Pandas is required for dataframe functionality" in stderr
                 and not has_pandas()
             ):
-                pytest.skip("pandas>=1.0,<3.0 not installed")
+                pytest.skip("pandas not installed")
             else:
                 pytest.fail(stderr)
6 changes: 3 additions & 3 deletions tiledb/tests/test_fixes.py
@@ -91,7 +91,7 @@ def test_ch8292(self):
             buffers = list(*q._get_buffers().values())
             assert buffers[0].nbytes == max_val

-    @pytest.mark.skipif(not has_pandas(), reason="pandas>=1.0,<3.0 not installed")
+    @pytest.mark.skipif(not has_pandas(), reason="pandas not installed")
     def test_ch10282_concurrent_multi_index(self):
         """Test concurrent access to a single tiledb.Array using
         Array.multi_index and Array.df. We pass an array and slice
@@ -230,7 +230,7 @@ def test_fix_stats_error_messages(self):

     @pytest.mark.skipif(
         not has_pandas() and has_pyarrow(),
-        reason="pandas>=1.0,<3.0 or pyarrow>=1.0 not installed",
+        reason="pandas or pyarrow>=1.0 not installed",
     )
     def test_py1078_df_all_empty_strings(self):
         uri = self.path()
@@ -246,7 +246,7 @@ def test_py1078_df_all_empty_strings(self):
         with tiledb.open(uri) as arr:
             tm.assert_frame_equal(arr.df[:], df)

-    @pytest.mark.skipif(not has_pandas(), reason="pandas>=1.0,<3.0 not installed")
+    @pytest.mark.skipif(not has_pandas(), reason="pandas not installed")
     @pytest.mark.parametrize("is_sparse", [True, False])
     def test_sc1430_nonexisting_timestamp(self, is_sparse):
         path = self.path("nonexisting_timestamp")
2 changes: 1 addition & 1 deletion tiledb/tests/test_hypothesis.py
@@ -13,7 +13,7 @@
 tm = pd._testing


-@pytest.mark.skipif(not has_pandas(), reason="pandas>=1.0,<3.0 not installed")
+@pytest.mark.skipif(not has_pandas(), reason="pandas not installed")
 @pytest.mark.parametrize("mode", ["np", "df"])
 @hp.settings(deadline=None, verbosity=hp.Verbosity.verbose)
 @hp.given(st.binary())
8 changes: 4 additions & 4 deletions tiledb/tests/test_libtiledb.py
@@ -418,7 +418,7 @@ def test_array_delete(self):

     @pytest.mark.skipif(
         not has_pyarrow() or not has_pandas(),
-        reason="pyarrow>=1.0 and/or pandas>=1.0,<3.0 not installed",
+        reason="pyarrow>=1.0 and/or pandas not installed",
     )
     @pytest.mark.parametrize("sparse", [True, False])
     @pytest.mark.parametrize("pass_df", [True, False])
@@ -1784,7 +1784,7 @@ def test_query_real_multi_index(self, fx_sparse_cell_order):
                 "coords" not in T.query(coords=False).multi_index[-10.0:5.0]
             )

-    @pytest.mark.skipif(not has_pandas(), reason="pandas>=1.0,<3.0 not installed")
+    @pytest.mark.skipif(not has_pandas(), reason="pandas not installed")
     @pytest.mark.parametrize("dtype", ["u1", "u2", "u4", "u8", "i1", "i2", "i4", "i8"])
     def test_sparse_index_dtypes(self, dtype):
         path = self.path()
@@ -1805,7 +1805,7 @@ def test_sparse_index_dtypes(self, dtype):
         assert B[data[1]]["attr"] == data[1]
         assert B.multi_index[data[0]]["attr"] == data[0]

-    @pytest.mark.skipif(not has_pandas(), reason="pandas>=1.0,<3.0 not installed")
+    @pytest.mark.skipif(not has_pandas(), reason="pandas not installed")
     @pytest.mark.skipif(
         tiledb.libtiledb.version() < (2, 10),
         reason="TILEDB_BOOL introduced in libtiledb 2.10",
@@ -3743,7 +3743,7 @@ def test_query_return_incomplete_error(self, sparse):
         with self.assertRaises(tiledb.TileDBError):
             A.query(return_incomplete=True)[:]

-    @pytest.mark.skipif(not has_pandas(), reason="pandas>=1.0,<3.0 not installed")
+    @pytest.mark.skipif(not has_pandas(), reason="pandas not installed")
     @pytest.mark.parametrize(
         "use_arrow, return_arrow, indexer",
         [