From 8d41a17f779f119c2a104036804aacad019c864f Mon Sep 17 00:00:00 2001 From: ajpotts Date: Thu, 29 Jan 2026 13:29:48 -0500 Subject: [PATCH] Closes #5371: Add locate and from_return_msg to pd Series accessor --- arkouda/numpy/pdarraycreation.py | 6 +- arkouda/pandas/extension/_arkouda_array.py | 11 ++ .../extension/_arkouda_extension_array.py | 18 ++- arkouda/pandas/extension/_series_accessor.py | 95 ++++++++++++ arkouda/pandas/series.py | 141 ++++++++++++++---- tests/pandas/extension/series_accessor.py | 63 ++++++++ 6 files changed, 301 insertions(+), 33 deletions(-) diff --git a/arkouda/numpy/pdarraycreation.py b/arkouda/numpy/pdarraycreation.py index 1948ddcc493..f4c1af498b4 100644 --- a/arkouda/numpy/pdarraycreation.py +++ b/arkouda/numpy/pdarraycreation.py @@ -329,7 +329,9 @@ def array( # If a is not already a numpy.ndarray, convert it if not isinstance(a, np.ndarray): try: - if dtype is not None and dtype != bigint: + from arkouda.numpy.dtypes import dtype as ak_dtype + + if dtype is not None and ak_dtype(dtype) != "bigint": # if the user specified dtype, use that dtype a = np.array(a, dtype=dtype) elif ( @@ -544,7 +546,7 @@ def _bigint_from_numpy( pass else: if not isinstance(x, (int, float, np.integer, np.floating)): - raise TypeError("bigint requires numeric input, got non-numeric object") + raise TypeError(f"bigint requires numeric input, got {type(x)}") # Handle floats: bigint-from-float behaves like existing code (cast to float64). # Note: we only need to know *that* a float exists, not which values. 
diff --git a/arkouda/pandas/extension/_arkouda_array.py b/arkouda/pandas/extension/_arkouda_array.py index 1a4b81bcf3e..0da902a6ac3 100644 --- a/arkouda/pandas/extension/_arkouda_array.py +++ b/arkouda/pandas/extension/_arkouda_array.py @@ -114,6 +114,17 @@ def __init__( def _from_sequence(cls, scalars, dtype=None, copy=False): from arkouda.numpy.pdarraycreation import array as ak_array + # normalize dtype input + if ( + dtype is not None + and ( + getattr(dtype, "name", None) in {"bigint", "ak.bigint"} + or str(dtype) in {"bigint", "ak.bigint"} + ) + or dtype is ArkoudaBigintDtype + ): + dtype = "bigint" + # If pandas passes our own EA dtype, ignore it and infer from data if isinstance(dtype, _ArkoudaBaseDtype): dtype = dtype.numpy_dtype diff --git a/arkouda/pandas/extension/_arkouda_extension_array.py b/arkouda/pandas/extension/_arkouda_extension_array.py index 6797de3fe19..67d396c362f 100644 --- a/arkouda/pandas/extension/_arkouda_extension_array.py +++ b/arkouda/pandas/extension/_arkouda_extension_array.py @@ -442,6 +442,15 @@ def take(self, indexer, fill_value=None, allow_fill=False): if fill_value is None: fill_value = self.default_fill_value + # Pandas commonly passes fill_value=-1 for integer-like data when allow_fill=True. + # That cannot be represented in unsigned dtypes; fall back to a safe fill. 
+ try: + if hasattr(self._data.dtype, "kind") and self._data.dtype.kind == "u" and fill_value < 0: + fill_value = 0 + except TypeError: + # Non-numeric fill_value; let the cast below raise a sensible error + pass + # cast once to ensure dtype match fv = self._data.dtype.type(fill_value) @@ -454,7 +463,14 @@ def take(self, indexer, fill_value=None, allow_fill=False): if oob.any(): raise IndexError("indexer out of bounds in take with allow_fill=True") - gathered = ak.where(mask, fv, self._data[idx_fix]) + data = self._data[idx_fix] + + if self._data.dtype == ak.bigint: + gathered = data.copy() + gathered[mask] = fill_value + else: + gathered = ak.where(mask, fv, data) + return type(self)(gathered) def factorize(self, use_na_sentinel=True) -> Tuple[NDArray[np.intp], "ArkoudaExtensionArray"]: diff --git a/arkouda/pandas/extension/_series_accessor.py b/arkouda/pandas/extension/_series_accessor.py index 65dc1e71c8a..45ec6bb9e31 100644 --- a/arkouda/pandas/extension/_series_accessor.py +++ b/arkouda/pandas/extension/_series_accessor.py @@ -377,6 +377,101 @@ def is_arkouda(self) -> bool: idx_arr = self._obj.index.values return isinstance(arr, ArkoudaExtensionArray) and isinstance(idx_arr, ArkoudaExtensionArray) + # ------------------------------------------------------------------ + # Legacy delegation: thin wrappers over ak.Series + # ------------------------------------------------------------------ + + def locate(self, key: object) -> pd.Series: + """ + Lookup values by index label on the Arkouda server. + + This is a thin wrapper around the legacy :meth:`arkouda.pandas.series.Series.locate` + method. It converts the pandas Series to a legacy Arkouda ``ak.Series``, + performs the locate operation on the server, and wraps the result back into + an Arkouda-backed pandas Series (ExtensionArray-backed) without NumPy + materialization. + + Parameters + ---------- + key : object + Lookup key or keys. 
Interpreted in the same way as the legacy Arkouda + ``Series.locate`` method. This may be: + - a scalar + - a list/tuple of scalars + - an Arkouda ``pdarray`` + - an Arkouda ``Index`` / ``MultiIndex`` + - an Arkouda ``Series`` (special case: preserves key index) + + Returns + ------- + pd.Series + A pandas Series backed by Arkouda ExtensionArrays containing the located + values. The returned Series remains distributed (no NumPy materialization) + and is sorted by index. + + Notes + ----- + * This method is Arkouda-specific; pandas does not define ``Series.locate``. + * If ``key`` is a pandas Index/MultiIndex, consider converting it via + ``key.ak.to_ak_legacy()`` before calling ``locate`` for the most direct path. + + Examples + -------- + >>> import arkouda as ak + >>> import pandas as pd + >>> s = pd.Series([10, 20, 30], index=pd.Index([1, 2, 3])).ak.to_ak() + >>> out = s.ak.locate([3, 1]) + >>> out.tolist() + [np.int64(10), np.int64(30)] + """ + # Lift the pandas Series into a legacy Arkouda Series + aks = self.to_ak_legacy() + + # Normalize common pandas key types to legacy Arkouda where possible + # (mirrors the “thin wrapper” approach used in the Index accessor). + if isinstance(key, (pd.Index, pd.MultiIndex)): + key = ArkoudaIndexAccessor(key).to_ak_legacy() + elif isinstance(key, pd.Series): + # For pandas Series keys, convert to legacy ak.Series to preserve key.index semantics. + key = ArkoudaSeriesAccessor(key).to_ak_legacy() + + out_ak = aks.locate(key) + + # Wrap result into an Arkouda-backed pandas Series + # (keep the returned index/name from the legacy result). + idx = ArkoudaIndexAccessor.from_ak_legacy(out_ak.index) # wrap the legacy index directly; no to_pandas() round-trip through the client + return _ak_array_to_pandas_series(out_ak.values, name=out_ak.name).set_axis(idx) + + @staticmethod + def _from_return_msg(rep_msg: str) -> pd.Series: + """ + Construct a pandas Series from a legacy Arkouda return message. + + This mirrors :meth:`ArkoudaIndexAccessor.from_return_msg`. 
It calls the legacy + ``ak.Series.from_return_msg`` constructor, then immediately wraps the resulting + Arkouda arrays back into a pandas Series backed by Arkouda ExtensionArrays. + + Parameters + ---------- + rep_msg : str + The ``+ delimited`` string message returned by Arkouda server operations + that construct a Series. + + Returns + ------- + pd.Series + A pandas Series backed by Arkouda ExtensionArrays (no NumPy materialization). + """ + ak_s = ak_Series.from_return_msg(rep_msg) + + # Wrap values and index without materializing + out = _ak_array_to_pandas_series(ak_s.values, name=ak_s.name) + + # Prefer wrapping the legacy index via the Index accessor helper path. + # We need a pandas Index/MultiIndex backed by EAs. + pd_idx = ArkoudaIndexAccessor.from_ak_legacy(ak_s.index) + return pd.Series(out.array, index=pd_idx, name=ak_s.name) + def groupby(self) -> GroupBy: """ Return an Arkouda GroupBy object for this Series, without materializing. diff --git a/arkouda/pandas/series.py b/arkouda/pandas/series.py index 4c9ad828cfd..71351b7d56a 100644 --- a/arkouda/pandas/series.py +++ b/arkouda/pandas/series.py @@ -27,7 +27,7 @@ from arkouda.categorical import Categorical from arkouda.numpy import cast as akcast from arkouda.numpy.alignment import lookup - from arkouda.numpy.pdarraycreation import arange, array, zeros + from arkouda.numpy.pdarraycreation import arange, zeros from arkouda.numpy.segarray import SegArray from arkouda.numpy.strings import Strings else: @@ -677,38 +677,119 @@ def locate(self, key: Union[int, pdarray, Index, Series, List, Tuple]) -> Series A Series containing the values corresponding to the key. """ + from arkouda.numpy.pdarraycreation import array + + def is_scalar_label(x) -> bool: + # scalar label component (NOT array-like) + return not isinstance(x, (pdarray, Index, Series, list, tuple)) + + def to_pdarray(obj) -> pdarray: + """ + Convert without ever touching pandas/numpy containers. 
+ Assumes Arkouda Index/Series store pdarrays internally. + """ + if isinstance(obj, pdarray): + return obj + if isinstance(obj, Index): + # Arkouda Index wrapper: underlying pdarray is on .index + return obj.index + if isinstance(obj, Series): + # Arkouda Series wrapper: unwrap to .values so the fallback below never receives the Series itself + obj = obj.values + if isinstance(obj, pdarray): + return obj + # python scalar / python list/tuple -> arkouda pdarray (server-side) + return array(obj) + + def rebuild_mi_with_names(mi: MultiIndex, names) -> MultiIndex: + # Do not rely on mi.names setter being functional + return MultiIndex(mi.levels, names=list(names)) + + def finalize(selector) -> Series: + out_index = self.index[selector] + if isinstance(out_index, MultiIndex) and isinstance(self.index, MultiIndex): + out_index = rebuild_mi_with_names(out_index, self.index.names) + return Series(index=out_index, data=self.values[selector]) + + # ---- Series key: preserve its index, lookup by its values (Arkouda Series) if isinstance(key, Series): - # special case, keep the index values of the Series, and lookup the values return Series(index=key.index, data=lookup(self.index.index, self.values, key.values)) - elif isinstance(key, MultiIndex): - idx = self.index.lookup(key.index) - elif isinstance(key, Index): - idx = self.index.lookup(key.index) - elif isinstance(key, pdarray): - idx = self.index.lookup(key) - elif isinstance(key, (list, tuple)): + + # ---- Direct index objects + if isinstance(key, MultiIndex): + return finalize(self.index.lookup(key.index)) + + if isinstance(key, Index): + return finalize(self.index.lookup(key.index)) + + # ---- pdarray key + if isinstance(key, pdarray): + return finalize(self.index.lookup(key)) + + # ---- list/tuple keys + if isinstance(key, (list, tuple)): + if isinstance(self.index, MultiIndex): + nlevels = self.index.nlevels + + if len(key) != nlevels: + raise TypeError( + "For MultiIndex Series, 'key' must be a tuple label, an Index/MultiIndex, " + "or per-level 
keys with length equal to nlevels." + ) + + # Reject flat list-of-scalars like [0, 2] + if isinstance(key, list): + all_scalar = True + for k in key: + if not is_scalar_label(k): + all_scalar = False + break + if all_scalar: + raise TypeError( + "For MultiIndex Series, a single label must be a tuple, e.g. (0, 2), " + "not a flat list like [0, 2]." + ) + + # Single scalar label tuple: (0, 10) -> per-level length-1 arrays + if isinstance(key, tuple): + all_scalar = True + for k in key: + if not is_scalar_label(k): + all_scalar = False + break + if all_scalar: + per_level = [array([k]) for k in key] + return finalize(self.index.lookup(per_level)) + + # Per-level keys: normalize each element without pandas/numpy + per_level = [to_pdarray(k) for k in key] + + # Enforce paired selection: equal sizes (metadata only) + sizes = [int(k.size) for k in per_level] + if len(set(sizes)) != 1: + raise ValueError( + f"Per-level MultiIndex keys must have the same length; got {sizes}." + ) + + return finalize(self.index.lookup(per_level)) + + # Non-MultiIndex: + # - list of scalars -> convert to pdarray + # - nested list/tuple -> transpose using pure Python (keys only) key0 = key[0] - if isinstance(key0, list) or isinstance(key0, tuple): - # nested list. 
check if already arkouda arrays - if not isinstance(key0[0], pdarray): - # convert list of lists to list of pdarrays - key = [array(a) for a in np.array(key).T.copy()] - - elif not isinstance(key0, pdarray): - # a list of scalers, convert into arkouda array - try: - val = array(key) - if isinstance(val, pdarray): - key = val - except Exception: - raise TypeError("'key' parameter must be convertible to pdarray") - - # else already list if arkouda array, use as is - idx = self.index.lookup(key) - else: - # scalar value - idx = self.index == key - return Series(index=self.index[idx], data=self.values[idx]) + + if isinstance(key0, (list, tuple)): + cols = list(zip(*key)) + per_level = [array(col) for col in cols] # col is a tuple of python scalars + return finalize(self.index.lookup(per_level)) + + if isinstance(key0, pdarray): + return finalize(self.index.lookup(key)) + + return finalize(self.index.lookup(to_pdarray(key))) + + # ---- scalar key + return finalize(self.index == key) @classmethod def _make_binop(cls, operator): diff --git a/tests/pandas/extension/series_accessor.py b/tests/pandas/extension/series_accessor.py index 34fedeb2056..2d59262f46c 100644 --- a/tests/pandas/extension/series_accessor.py +++ b/tests/pandas/extension/series_accessor.py @@ -13,6 +13,7 @@ _ak_array_to_pandas_series, _pandas_series_to_ak_array, ) +from arkouda.pandas.index import MultiIndex from arkouda.pandas.series import Series as ak_Series @@ -22,6 +23,7 @@ def _assert_series_equal_values(s: pd.Series, values): class TestArkoudaSeriesAccessor: + @pytest.mark.requires_chapel_module("In1dMsg") def test_series_accessor_docstrings(self): import doctest @@ -249,6 +251,28 @@ def test_ak_array_to_pandas_series_preserves_provided_index_and_makes_it_arkouda # Data correct assert np.array_equal(s.to_numpy(), np.array([10, 20, 30, 40])) + def test_series_locate_multiindex_accepts_per_level_pdarray_keys(self): + # Build MultiIndex Series + lvl0 = ak.array([0, 0, 1, 1]) + lvl1 = ak.array([10, 
11, 10, 11]) + mi = MultiIndex([lvl0, lvl1], names=["a", "b"]) + + vals = ak.array([100, 101, 102, 103]) + s = ak_Series(vals, index=mi) + + # Per-level keys (pdarrays) — should be supported + k0 = ak.array([0, 1]) + k1 = ak.array([10, 11]) + + # Expected: select (0,10) and (1,11) in that order -> [100, 103] + out_tuple = s.locate((k0, k1)) + out = s.locate([k0, k1]) + assert out_tuple.tolist() == out.tolist() # tuple and list key forms must select the same rows + # Compare values (and optionally index) + assert out.tolist() == [100, 103] + assert out.index.nlevels == 2 + assert out.index.names == ["a", "b"] + class TestArkoudaSeriesGroupby: def test_series_ak_groupby_raises_if_not_arkouda_backed(self): @@ -287,3 +311,42 @@ def test_series_ak_groupby_raises_if_underlying_array_missing_data(self): with pytest.raises(TypeError, match=r"Arkouda-backed Series array does not expose '_data'"): _ = s.ak.groupby() + + # ------------------------------------------------------------------ + # locate + # ------------------------------------------------------------------ + + @pytest.mark.requires_chapel_module("In1dMsg") + @pytest.mark.parametrize("dtype", ["int64", "uint64", "bool_", "bigint"]) + @pytest.mark.parametrize("dtype_index", ["ak_int64", "ak_uint64"]) + def test_locate(self, dtype, dtype_index): + pda = pd.array(ak.arange(3, dtype=dtype), dtype="ak." 
+ dtype) + pda2 = pd.array(ak.array(["A", "B", "C"]), dtype="ak_str") + idx = pd.array(ak.arange(3), dtype=dtype_index) + for val in pda, pda2: + s = pd.Series(val, index=idx).ak.to_ak() + + for key in ( + 1, + pd.Index([1], dtype=dtype_index), + pd.Index([0, 2], dtype=dtype_index), + ): + lk = s.ak.locate(key) + assert isinstance(lk, pd.Series) + key = ak.array(key) if not isinstance(key, int) else key + assert (lk.index == s.index[key]).all() + assert (lk.values == s.values[key]).all() + + # testing multi-index lookup + mi = pd.MultiIndex.from_arrays([pda, pda[::-1]]) + s = pd.Series(data=val, index=mi) + lk = s.ak.locate(mi[0]) + assert isinstance(lk, pd.Series) + assert lk.values[0] == val[0] + + # ensure error with scalar and multi-index + with pytest.raises(TypeError): + s.ak.locate(0) + + with pytest.raises(TypeError): + s.ak.locate([0, 2])