diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 5802d7fda1..9884102931 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -156,6 +156,13 @@ jobs:
           cd $RUNNER_TEMP
           pytest -vv --showlocals $PROJECT_CWD

+      - name: "Re-run tests with pandas 2"
+        run: |
+          PROJECT_CWD=$PWD
+          pip install "pandas>=2,<3"
+          cd $RUNNER_TEMP
+          pytest -vv --showlocals $PROJECT_CWD
+
       - name: "Re-run tests without pandas"
         run: |
           PROJECT_CWD=$PWD
diff --git a/examples/incomplete_iteration.py b/examples/incomplete_iteration.py
index 4166cff7d9..aa458a3c84 100644
--- a/examples/incomplete_iteration.py
+++ b/examples/incomplete_iteration.py
@@ -40,21 +40,14 @@


 def check_dataframe_deps():
-    pd_error = """Pandas version >= 1.0 and < 3.0 required for dataframe functionality.
-    Please `pip install pandas>=1.0,<3.0` to proceed."""
+    pd_error = """Pandas is required for dataframe functionality.
+    Please `pip install pandas` to proceed."""
     try:
-        import pandas as pd
+        import pandas
     except ImportError:
         raise Exception(pd_error)

-    from packaging.version import Version
-
-    if Version(pd.__version__) < Version("1.0") or Version(pd.__version__) >= Version(
-        "3.0.0.dev0"
-    ):
-        raise Exception(pd_error)
-

 # Name of the array to create.
 array_name = "incomplete_iteration"
diff --git a/examples/parallel_csv_ingestion.py b/examples/parallel_csv_ingestion.py
index 2038df7093..799f210f92 100644
--- a/examples/parallel_csv_ingestion.py
+++ b/examples/parallel_csv_ingestion.py
@@ -49,21 +49,14 @@


 def check_dataframe_deps():
-    pd_error = """Pandas version >= 1.0 and < 3.0 required for dataframe functionality.
-    Please `pip install pandas>=1.0,<3.0` to proceed."""
+    pd_error = """Pandas is required for dataframe functionality.
+    Please `pip install pandas` to proceed."""
     try:
-        import pandas as pd
+        import pandas
     except ImportError:
         raise Exception(pd_error)

-    from packaging.version import Version
-
-    if Version(pd.__version__) < Version("1.0") or Version(pd.__version__) >= Version(
-        "3.0.0.dev0"
-    ):
-        raise Exception(pd_error)
-

 def generate_csvs(csv_folder, count=9, min_length=1, max_length=109):
     def make_dataframe(col_size):
diff --git a/pyproject.toml b/pyproject.toml
index b8f88f2dd3..4fabff4fd0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -57,7 +57,7 @@ test = [
     "hypothesis",
     "psutil",
     "pyarrow",
-    "pandas<3",
+    "pandas",
     "dask[distributed]",
 ]

@@ -118,6 +118,6 @@ test-requires = [
     "hypothesis",
     "psutil",
     "pyarrow",
-    "pandas<3",
+    "pandas",
 ]
 test-command = "pytest {project}"
diff --git a/tiledb/dataframe_.py b/tiledb/dataframe_.py
index 50755eccad..4f9d3afd78 100644
--- a/tiledb/dataframe_.py
+++ b/tiledb/dataframe_.py
@@ -15,28 +15,18 @@


 def check_dataframe_deps():
-    pd_error = """Pandas version >= 1.0 and < 3.0 required for dataframe functionality.
-    Please `pip install pandas>=1.0,<3.0` to proceed."""
-    pa_error = """PyArrow version >= 1.0 is suggested for dataframe functionality.
-    Please `pip install pyarrow>=1.0`."""
+    pd_error = """Pandas is required for dataframe functionality.
+    Please `pip install pandas` to proceed."""
+    pa_error = """PyArrow is suggested for dataframe functionality.
+    Please `pip install pyarrow`."""
     try:
-        import pandas as pd
+        import pandas
     except ImportError:
         raise Exception(pd_error)

-    from packaging.version import Version
-
-    if Version(pd.__version__) < Version("1.0") or Version(pd.__version__) >= Version(
-        "3.0.0.dev0"
-    ):
-        raise Exception(pd_error)
-
     try:
-        import pyarrow as pa
-
-        if Version(pa.__version__) < Version("1.0"):
-            warnings.warn(pa_error)
+        import pyarrow
     except ImportError:
         warnings.warn(pa_error)
@@ -154,7 +144,7 @@ class ColumnInfo:

     @classmethod
     def from_values(cls, array_like, varlen_types=()):
-        from pandas import CategoricalDtype
+        from pandas import CategoricalDtype, StringDtype
         from pandas.api import types as pd_types

         if pd_types.is_object_dtype(array_like):
@@ -171,6 +161,16 @@ def from_values(cls, array_like, varlen_types=()):
             raise NotImplementedError(
                 f"{inferred_dtype} inferred dtype not supported (column {array_like.name})"
             )
+        elif hasattr(array_like, "dtype") and isinstance(array_like.dtype, StringDtype):
+            # Explicit pd.StringDtype() (name="string") is always nullable;
+            # auto-inferred str (name="str") depends on data
+            explicit = array_like.dtype.name == "string"
+            return cls(
+                np.dtype(np.str_),
+                repr="string" if explicit else None,
+                var=True,
+                nullable=explicit or bool(array_like.isna().any()),
+            )
         elif hasattr(array_like, "dtype") and isinstance(
             array_like.dtype, CategoricalDtype
         ):
@@ -211,6 +211,14 @@ def from_dtype(cls, dtype, column_name, varlen_types=()):
         dtype = pd_types.pandas_dtype(dtype)

         # Note: be careful if you rearrange the order of the following checks
+        # pandas StringDtype (auto-inferred 'str' and explicit 'string')
+        from pandas import StringDtype
+
+        if isinstance(dtype, StringDtype):
+            repr_val = "string" if dtype.name == "string" else None
+            nullable = dtype.name == "string"
+            return cls(np.dtype(np.str_), repr=repr_val, var=True, nullable=nullable)
+
         # extension types
         if pd_types.is_extension_array_dtype(dtype):
             if libtiledb_version() < (2, 10) and pd_types.is_bool_dtype(dtype):
@@ -255,12 +263,7 @@ def from_dtype(cls, dtype, column_name, varlen_types=()):

         # datetime types
         if pd_types.is_datetime64_any_dtype(dtype):
-            if dtype == "datetime64[ns]":
-                return cls(dtype)
-            else:
-                raise NotImplementedError(
-                    f"Only 'datetime64[ns]' datetime dtype is supported (column {column_name})"
-                )
+            return cls(dtype)

         # string types
         # don't use pd_types.is_string_dtype() because it includes object types too
@@ -517,8 +520,8 @@ def _df_to_np_arrays(df, column_infos, fillna):
             if not column_info.var:
                 to_numpy_kwargs.update(dtype=column_info.dtype)

-            if column_info.nullable:
-                # use default 0/empty for the dtype
+            if column_info.nullable and column.isna().any():
+                # Only create nullmap if data actually has nulls
                 to_numpy_kwargs.update(na_value=column_info.dtype.type())
                 nullmaps[name] = (~column.isna()).to_numpy(dtype=np.uint8)
diff --git a/tiledb/dense_array.py b/tiledb/dense_array.py
index 71e086aaa4..41ed5aed80 100644
--- a/tiledb/dense_array.py
+++ b/tiledb/dense_array.py
@@ -481,6 +481,14 @@ def _setitem_impl(self, selection, val, nullmaps: dict):

             try:
                 if attr.isvar:
+                    # Capture null mask before np.asarray() loses pandas NA info
+                    if (
+                        attr.isnullable
+                        and name not in nullmaps
+                        and hasattr(attr_val, "isna")
+                    ):
+                        nullmaps[name] = (~attr_val.isna()).to_numpy(dtype=np.uint8)
+
                     # ensure that the value is array-convertible, for example: pandas.Series
                     attr_val = np.asarray(attr_val)
                     if attr.isnullable and name not in nullmaps:
diff --git a/tiledb/multirange_indexing.py b/tiledb/multirange_indexing.py
index 537caf22dd..72cfb67bc9 100644
--- a/tiledb/multirange_indexing.py
+++ b/tiledb/multirange_indexing.py
@@ -890,7 +890,16 @@ def _update_df_from_meta(
                 col_dtypes[name] = dtype

     if col_dtypes:
-        df = df.astype(col_dtypes, copy=False)
+        # ''). Applying astype('= Version(
-        "3.0.0.dev0"
-    ):
+        return True
+    except ImportError:
         return False
-    return True
-

 def has_pyarrow():
     try:
diff --git a/tiledb/tests/datatypes.py b/tiledb/tests/datatypes.py
index a8c077f1b9..ab253da93f 100644
--- a/tiledb/tests/datatypes.py
+++ b/tiledb/tests/datatypes.py
@@ -48,7 +48,9 @@ def __len__(self):
         return len(self._flat_arrays)

     def __getitem__(self, i):
-        return self._flat_arrays[i]
+        if isinstance(i, (int, np.integer)):
+            return self._flat_arrays[i]
+        return type(self)(self._flat_arrays[i], self._dtype)

     @property
     def dtype(self):
@@ -56,3 +58,7 @@ def dtype(self):

     def copy(self):
         return type(self)(self._flat_arrays, self._dtype)
+
+    @property
+    def ndim(self):
+        return 1
diff --git a/tiledb/tests/test_enumeration.py b/tiledb/tests/test_enumeration.py
index 00ed043fd0..7d7c8e5382 100644
--- a/tiledb/tests/test_enumeration.py
+++ b/tiledb/tests/test_enumeration.py
@@ -111,7 +111,7 @@ def test_array_schema_enumeration(self):

     @pytest.mark.skipif(
         not has_pyarrow() or not has_pandas(),
-        reason="pyarrow>=1.0 and/or pandas>=1.0,<3.0 not installed",
+        reason="pyarrow>=1.0 and/or pandas not installed",
     )
     @pytest.mark.parametrize("sparse", [True, False])
     @pytest.mark.parametrize("pass_df", [True, False])
@@ -185,7 +185,7 @@ def test_enum_dtypes(self, dtype, values):
         assert enmr.dtype == enmr.values().dtype == dtype
         assert_array_equal(enmr.values(), values)

-    @pytest.mark.skipif(not has_pandas(), reason="pandas>=1.0,<3.0 not installed")
+    @pytest.mark.skipif(not has_pandas(), reason="pandas not installed")
     def test_from_pandas_dtype_mismatch(self):
         import pandas as pd
diff --git a/tiledb/tests/test_examples.py b/tiledb/tests/test_examples.py
index 1c3f18a899..10793e36d4 100644
--- a/tiledb/tests/test_examples.py
+++ b/tiledb/tests/test_examples.py
@@ -43,7 +43,7 @@ def test_examples(self, path):
             ]
         ]
         if not has_pandas() and path in requires_pd:
-            pytest.mark.skip("pandas>=1.0,<3.0 not installed")
+            pytest.mark.skip("pandas not installed")
         else:
             with tempfile.TemporaryDirectory() as tmpdir:
                 try:
@@ -73,10 +73,9 @@ def test_docs(self, capsys):
         if failures:
             stderr = capsys.readouterr().out
             if "No module named 'pandas'" in stderr or (
-                "Pandas version >= 1.0 and < 3.0 required for dataframe functionality"
-                in stderr
+                "Pandas is required for dataframe functionality" in stderr
                 and not has_pandas()
             ):
-                pytest.skip("pandas>=1.0,<3.0 not installed")
+                pytest.skip("pandas not installed")
             else:
                 pytest.fail(stderr)
diff --git a/tiledb/tests/test_fixes.py b/tiledb/tests/test_fixes.py
index 73d5611f5b..aa82d4f2e3 100644
--- a/tiledb/tests/test_fixes.py
+++ b/tiledb/tests/test_fixes.py
@@ -91,7 +91,7 @@ def test_ch8292(self):
             buffers = list(*q._get_buffers().values())
             assert buffers[0].nbytes == max_val

-    @pytest.mark.skipif(not has_pandas(), reason="pandas>=1.0,<3.0 not installed")
+    @pytest.mark.skipif(not has_pandas(), reason="pandas not installed")
     def test_ch10282_concurrent_multi_index(self):
         """Test concurrent access to a single tiledb.Array using Array.multi_index and Array.df.
        We pass an array and slice
@@ -230,7 +230,7 @@ def test_fix_stats_error_messages(self):

     @pytest.mark.skipif(
         not has_pandas() and has_pyarrow(),
-        reason="pandas>=1.0,<3.0 or pyarrow>=1.0 not installed",
+        reason="pandas or pyarrow>=1.0 not installed",
     )
     def test_py1078_df_all_empty_strings(self):
         uri = self.path()
@@ -246,7 +246,7 @@ def test_py1078_df_all_empty_strings(self):
         with tiledb.open(uri) as arr:
             tm.assert_frame_equal(arr.df[:], df)

-    @pytest.mark.skipif(not has_pandas(), reason="pandas>=1.0,<3.0 not installed")
+    @pytest.mark.skipif(not has_pandas(), reason="pandas not installed")
     @pytest.mark.parametrize("is_sparse", [True, False])
     def test_sc1430_nonexisting_timestamp(self, is_sparse):
         path = self.path("nonexisting_timestamp")
diff --git a/tiledb/tests/test_hypothesis.py b/tiledb/tests/test_hypothesis.py
index 6494f420da..c73bb876cd 100644
--- a/tiledb/tests/test_hypothesis.py
+++ b/tiledb/tests/test_hypothesis.py
@@ -13,7 +13,7 @@
 tm = pd._testing


-@pytest.mark.skipif(not has_pandas(), reason="pandas>=1.0,<3.0 not installed")
+@pytest.mark.skipif(not has_pandas(), reason="pandas not installed")
 @pytest.mark.parametrize("mode", ["np", "df"])
 @hp.settings(deadline=None, verbosity=hp.Verbosity.verbose)
 @hp.given(st.binary())
diff --git a/tiledb/tests/test_libtiledb.py b/tiledb/tests/test_libtiledb.py
index 07d852ba5b..38f04c5cf4 100644
--- a/tiledb/tests/test_libtiledb.py
+++ b/tiledb/tests/test_libtiledb.py
@@ -418,7 +418,7 @@ def test_array_delete(self):

     @pytest.mark.skipif(
         not has_pyarrow() or not has_pandas(),
-        reason="pyarrow>=1.0 and/or pandas>=1.0,<3.0 not installed",
+        reason="pyarrow>=1.0 and/or pandas not installed",
     )
     @pytest.mark.parametrize("sparse", [True, False])
     @pytest.mark.parametrize("pass_df", [True, False])
@@ -1784,7 +1784,7 @@ def test_query_real_multi_index(self, fx_sparse_cell_order):
                 "coords" not in T.query(coords=False).multi_index[-10.0:5.0]
             )

-    @pytest.mark.skipif(not has_pandas(), reason="pandas>=1.0,<3.0 not installed")
+    @pytest.mark.skipif(not has_pandas(), reason="pandas not installed")
     @pytest.mark.parametrize("dtype", ["u1", "u2", "u4", "u8", "i1", "i2", "i4", "i8"])
     def test_sparse_index_dtypes(self, dtype):
         path = self.path()
@@ -1805,7 +1805,7 @@ def test_sparse_index_dtypes(self, dtype):
             assert B[data[1]]["attr"] == data[1]
             assert B.multi_index[data[0]]["attr"] == data[0]

-    @pytest.mark.skipif(not has_pandas(), reason="pandas>=1.0,<3.0 not installed")
+    @pytest.mark.skipif(not has_pandas(), reason="pandas not installed")
     @pytest.mark.skipif(
         tiledb.libtiledb.version() < (2, 10),
         reason="TILEDB_BOOL introduced in libtiledb 2.10",
@@ -3743,7 +3743,7 @@ def test_query_return_incomplete_error(self, sparse):
             with self.assertRaises(tiledb.TileDBError):
                 A.query(return_incomplete=True)[:]

-    @pytest.mark.skipif(not has_pandas(), reason="pandas>=1.0,<3.0 not installed")
+    @pytest.mark.skipif(not has_pandas(), reason="pandas not installed")
     @pytest.mark.parametrize(
         "use_arrow, return_arrow, indexer",
         [
diff --git a/tiledb/tests/test_multi_index.py b/tiledb/tests/test_multi_index.py
index 6f8c7ef166..1a6f713ca8 100644
--- a/tiledb/tests/test_multi_index.py
+++ b/tiledb/tests/test_multi_index.py
@@ -141,7 +141,7 @@ def make_arr(ndim):
 class TestMultiRange(DiskTestCase):
     @pytest.mark.skipif(
         not has_pyarrow() or not has_pandas(),
-        reason="pyarrow>=1.0 and/or pandas>=1.0,<3.0 not installed",
+        reason="pyarrow>=1.0 and/or pandas not installed",
     )
     def test_return_arrow_indexers(self):
         uri = self.path("multirange_behavior_sparse")
@@ -183,7 +183,7 @@ def test_return_arrow_indexers(self):

     @pytest.mark.skipif(
         not has_pyarrow() or not has_pandas(),
-        reason="pyarrow>=1.0 and/or pandas>=1.0,<3.0 not installed",
+        reason="pyarrow>=1.0 and/or pandas not installed",
     )
     @pytest.mark.parametrize("sparse", [True, False])
     def test_return_large_arrow_table(self, sparse):
@@ -727,7 +727,7 @@ def test_fix_473_sparse_index_bug(self):
                 np.array([], dtype=np.uint64),
             )

-    @pytest.mark.skipif(not has_pandas(), reason="pandas>=1.0,<3.0 not installed")
+    @pytest.mark.skipif(not has_pandas(), reason="pandas not installed")
     def test_fixed_multi_attr_df(self):
         uri = self.path("test_fixed_multi_attr_df")
         dom = tiledb.Domain(
@@ -761,7 +761,7 @@ def test_fixed_multi_attr_df(self):
             result = A.query(attrs=["111"], use_arrow=False)
             assert_array_equal(result.df[0]["111"], data_111)

-    @pytest.mark.skipif(not has_pandas(), reason="pandas>=1.0,<3.0 not installed")
+    @pytest.mark.skipif(not has_pandas(), reason="pandas not installed")
     def test_var_multi_attr_df(self):
         uri = self.path("test_var_multi_attr_df")
         dom = tiledb.Domain(
@@ -845,7 +845,7 @@ def test_multi_index_open_timestamp_with_empty_nonempty_domain(self):
             assert A.nonempty_domain() is None
             assert_array_equal(A.multi_index[:][""], A[:][""])

-    @pytest.mark.skipif(not has_pandas(), reason="pandas>=1.0,<3.0 not installed")
+    @pytest.mark.skipif(not has_pandas(), reason="pandas not installed")
     def test_multi_index_query_args(self):
         uri = self.path("test_multi_index_query_args")
         schema = tiledb.ArraySchema(
@@ -871,7 +871,7 @@ def test_multi_index_query_args(self):
             assert_array_equal(q.multi_index[:]["a"], q.df[:]["a"])
             assert all(q[:]["a"] >= 5)

-    @pytest.mark.skipif(not has_pandas(), reason="pandas>=1.0,<3.0 not installed")
+    @pytest.mark.skipif(not has_pandas(), reason="pandas not installed")
     def test_multi_index_timing(self):
         path = self.path("test_multi_index_timing")
         attr_name = "a"
@@ -886,7 +886,7 @@ def test_multi_index_timing(self):
             assert "py.getitem_time.pandas_index_update_time :" in internal_stats
         tiledb.stats_disable()

-    @pytest.mark.skipif(not has_pandas(), reason="pandas>=1.0,<3.0 not installed")
+    @pytest.mark.skipif(not has_pandas(), reason="pandas not installed")
     def test_fixed_width_char(self):
         uri = self.path("test_fixed_width_char")
         schema = tiledb.ArraySchema(
@@ -903,7 +903,7 @@ def test_fixed_width_char(self):
         with tiledb.open(uri, mode="r") as A:
             assert all(A.query(use_arrow=True).df[:][""] == data)

-    @pytest.mark.skipif(not has_pandas(), reason="pandas>=1.0,<3.0 not installed")
+    @pytest.mark.skipif(not has_pandas(), reason="pandas not installed")
     def test_empty_idx(self):
         uri = self.path("test_empty_idx")
diff --git a/tiledb/tests/test_pandas_dataframe.py b/tiledb/tests/test_pandas_dataframe.py
index fa4350d8d6..144abe98b3 100644
--- a/tiledb/tests/test_pandas_dataframe.py
+++ b/tiledb/tests/test_pandas_dataframe.py
@@ -29,7 +29,7 @@
 from .datatypes import RaggedDtype

 if not has_pandas():
-    pytest.skip("pandas>=1.0,<3.0 not installed", allow_module_level=True)
+    pytest.skip("pandas not installed", allow_module_level=True)
 else:
     import pandas as pd

@@ -212,24 +212,24 @@ def test_object_dtype(self):
         for s in ["hello", b"world"], ["hello", 1], [b"hello", 1]:
             pytest.raises(NotImplementedError, ColumnInfo.from_values, pd.Series(s))

+    def test_string_dtype(self):
+        # Auto-inferred str type: non-nullable when data has no nulls
+        info = ColumnInfo.from_values(pd.Series(["hello", "world"]))
+        assert info.dtype == np.dtype("
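For reference, a minimal sketch (not part of the patch) of the nullability rule encoded by the new StringDtype branches in tiledb/dataframe_.py. It assumes a pandas version that provides pd.StringDtype and that ColumnInfo is imported from tiledb.dataframe_, as in the test module above:

# Sketch only: exercises the StringDtype handling added in this diff.
import numpy as np
import pandas as pd

from tiledb.dataframe_ import ColumnInfo

# An explicit pd.StringDtype() column (dtype.name == "string") maps to a
# var-length string attribute that is always nullable.
explicit = ColumnInfo.from_values(pd.Series(["a", "b"], dtype="string"))
assert explicit.var and explicit.nullable and explicit.repr == "string"

# The same rule applies when starting from the dtype alone via from_dtype.
from_dtype = ColumnInfo.from_dtype(pd.StringDtype(), "col")
assert from_dtype.nullable and from_dtype.dtype == np.dtype(np.str_)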