
Commit 8497cc9

Clean up binning by using pd.Index for expected_groups

1 parent ced653f

File tree

4 files changed: +68, -69 lines changed
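Note: the public API is unchanged; `groupby_reduce` still takes raw bin edges with `isbin=True`. The cleanup is internal: expected groups are normalized to a `pd.Index` up front, with bin edges becoming a `pd.IntervalIndex`, so later code can tell binning apart from plain grouping with a single type check. A minimal sketch of the idea (example values invented for illustration):

import numpy as np
import pandas as pd

# Plain grouping: labels become an ordinary pd.Index.
labels = pd.Index(["a", "b", "c"])

# Binning: edges [0, 1, 2, 3] become the intervals (0, 1], (1, 2], (2, 3].
edges = np.array([0, 1, 2, 3])
bins = pd.IntervalIndex.from_arrays(edges[:-1], edges[1:])

# Downstream code can now ask "are we binning?" by type alone:
print(isinstance(bins, pd.IntervalIndex))    # True
print(isinstance(labels, pd.IntervalIndex))  # False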

flox/core.py

Lines changed: 51 additions & 59 deletions
@@ -58,13 +58,14 @@ def _prepare_for_flox(group_idx, array):
     return group_idx, ordered_array


-def _get_expected_groups(by, raise_if_dask=True) -> np.ndarray | None:
+def _get_expected_groups(by, raise_if_dask=True) -> pd.Index | None:
     if is_duck_dask_array(by):
         if raise_if_dask:
             raise ValueError("Please provide `expected_groups`.")
         return None
     flatby = by.ravel()
-    return np.unique(flatby[~isnull(flatby)])
+    expected = np.unique(flatby[~isnull(flatby)])
+    return _convert_expected_groups_to_index(expected, isbin=False)


 def _get_chunk_reduction(reduction_type: str) -> Callable:
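For reference, a rough sketch of what `_get_expected_groups` now returns for a small NumPy `by` array (values invented; `pd.isnull` stands in for flox's internal `isnull` helper):

import numpy as np
import pandas as pd

by = np.array([[1.0, 2.0], [np.nan, 1.0]])
flatby = by.ravel()

# Unique non-NaN labels, wrapped in a pd.Index as the new return type requires.
expected = np.unique(flatby[~pd.isnull(flatby)])
print(pd.Index(expected))  # Index([1.0, 2.0], dtype='float64') on recent pandas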
@@ -324,31 +325,28 @@ def rechunk_for_blockwise(array, axis, labels):

 def reindex_(array: np.ndarray, from_, to, fill_value=None, axis: int = -1) -> np.ndarray:

+    assert isinstance(to, pd.Index)
     assert axis in (0, -1)

-    from_ = np.atleast_1d(from_)
-    to = np.atleast_1d(to)
-
     if to.ndim > 1:
         raise ValueError(f"Cannot reindex to a multidimensional array: {to}")

-    # short-circuit for trivial case
-    if len(from_) == len(to) and np.all(from_ == to):
-        return array
-
     if array.shape[axis] == 0:
         # all groups were NaN
         reindexed = np.full(array.shape[:-1] + (len(to),), fill_value, dtype=array.dtype)
         return reindexed

+    from_ = pd.Index(from_)
+    # short-circuit for trivial case
+    if from_.equals(to):
+        return array
+
     if from_.dtype.kind == "O" and isinstance(from_[0], tuple):
         raise NotImplementedError(
             "Currently does not support reindexing with object arrays of tuples. "
             "These occur when grouping by multi-indexed variables in xarray."
         )
-    idx = np.array(
-        [np.argwhere(np.array(from_) == label)[0, 0] if label in from_ else -1 for label in to]
-    )
+    idx = from_.get_indexer(to)
     indexer = [slice(None, None)] * array.ndim
     indexer[axis] = idx  # type: ignore
     reindexed = array[tuple(indexer)]
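The deleted comprehension scanned `from_` once per label in `to`; `pd.Index.get_indexer` performs the same lookup in one vectorized call and already returns -1 for missing labels, matching the old sentinel. A small standalone check (example labels invented):

import numpy as np
import pandas as pd

from_ = pd.Index(["b", "d", "a"])
to = pd.Index(["a", "b", "c"])

idx = from_.get_indexer(to)
print(idx)  # [ 2  0 -1] -- "c" is absent from `from_`, so it maps to -1

# Equivalent to the removed list comprehension:
old = np.array(
    [np.argwhere(np.asarray(from_) == label)[0, 0] if label in from_ else -1 for label in to]
)
assert (idx == old).all()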
@@ -384,29 +382,25 @@ def offset_labels(labels: np.ndarray, ngroups: int) -> tuple[np.ndarray, int]:
     return offset, size


-def factorize_(by: tuple, axis, expected_groups: tuple = None, isbin: tuple = None):
+def factorize_(by: tuple, axis, expected_groups: tuple[pd.Index, ...] = None):
     if not isinstance(by, tuple):
         raise ValueError(f"Expected `by` to be a tuple. Received {type(by)} instead")

-    if isbin is None:
-        isbin = (False,) * len(by)
     if expected_groups is None:
         expected_groups = (None,) * len(by)

     factorized = []
     found_groups = []
-    for groupvar, expect, tobin in zip(by, expected_groups, isbin):
-        if tobin:
+    for groupvar, expect in zip(by, expected_groups):
+        if isinstance(expect, pd.IntervalIndex):
             # when binning we change expected groups to integers marking the interval
             # this makes the reindexing logic simpler.
             if expect is None:
                 raise ValueError("Please pass bin edges in expected_groups.")
             # idx = np.digitize(groupvar.ravel(), expect) - 1
-            idx = pd.cut(groupvar.ravel(), bins=expect, labels=False)
+            idx = pd.cut(groupvar.ravel(), bins=expect, labels=False).codes.copy()
             # same sentinel value as factorize
-            idx[np.isnan(idx)] = -1
-            idx = idx.astype(int, copy=False)
-            found_groups.append(np.arange(len(expect) - 1))
+            found_groups.append(expect)
         else:
             idx, groups = pd.factorize(groupvar.ravel())
             found_groups.append(np.array(groups))
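When `bins` is a `pd.IntervalIndex`, `pd.cut` ignores `labels` and returns a `Categorical`; its `.codes` are already integer bin ids with -1 for values outside every bin, the same sentinel `pd.factorize` uses for NaNs, which is why the explicit NaN-to--1 conversion could be dropped. Roughly (example values invented):

import numpy as np
import pandas as pd

edges = np.array([0, 1, 2, 3])
bins = pd.IntervalIndex.from_arrays(edges[:-1], edges[1:])  # (0, 1], (1, 2], (2, 3]

values = np.array([0.5, 1.5, 2.5, 99.0])  # 99.0 falls outside every bin
cut = pd.cut(values, bins=bins)

# .codes marks out-of-bounds values with -1, matching pd.factorize's sentinel.
print(cut.codes)  # [ 0  1  2 -1]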
@@ -447,12 +441,11 @@ def chunk_argreduce(
     array_plus_idx: tuple[np.ndarray, ...],
     by: np.ndarray,
     func: Sequence[str],
-    expected_groups: Sequence | np.ndarray | None,
+    expected_groups: pd.Index | None,
     axis: int | Sequence[int],
     fill_value: Mapping[str | Callable, Any],
     dtype=None,
     reindex: bool = False,
-    isbin: bool = False,
     engine: str = "numpy",
 ) -> IntermediateDict:
     """
@@ -470,7 +463,6 @@
         expected_groups=None,
         axis=axis,
         fill_value=fill_value,
-        isbin=isbin,
         dtype=dtype,
         engine=engine,
     )
@@ -493,12 +485,11 @@ def chunk_reduce(
     array: np.ndarray,
     by: np.ndarray,
     func: str | Callable | Sequence[str] | Sequence[Callable],
-    expected_groups: Sequence | np.ndarray = None,
+    expected_groups: pd.Index | None,
     axis: int | Sequence[int] = None,
     fill_value: Mapping[str | Callable, Any] = None,
     dtype=None,
     reindex: bool = False,
-    isbin: bool = False,
     engine: str = "numpy",
     kwargs=None,
 ) -> IntermediateDict:
@@ -574,12 +565,9 @@
     # indices=[0,0,0]. This is necessary when combining block results
     # factorize can handle strings etc unlike digitize
     group_idx, groups, _, ngroups, size, props = factorize_(
-        (by,), axis, expected_groups=(expected_groups,), isbin=(isbin,)
+        (by,), axis, expected_groups=(expected_groups,)
     )
     groups = groups[0]
-    # TODO: why?
-    if isbin:
-        expected_groups = groups

     # always reshape to 1D along group dimensions
     newshape = array.shape[: array.ndim - by.ndim] + (np.prod(array.shape[-by.ndim :]),)
@@ -590,7 +578,8 @@

     results: IntermediateDict = {"groups": [], "intermediates": []}
     if reindex and expected_groups is not None:
-        results["groups"] = np.array(expected_groups)
+        # TODO: what happens with binning here?
+        results["groups"] = expected_groups.values
     else:
         if empty:
             results["groups"] = np.array([np.nan])
@@ -654,16 +643,13 @@
         if props.offset_group:
             result = result.reshape(*final_array_shape[:-1], ngroups)
         if reindex:
-            if not isbin:
-                result = reindex_(result, groups, expected_groups, fill_value=fv)
+            result = reindex_(result, groups, expected_groups, fill_value=fv)
         else:
             result = result[..., sortidx]
         result = result.reshape(final_array_shape)
         results["intermediates"].append(result)
-    if final_groups_shape:
-        # This happens when to_group is broadcasted, and we reduce along the broadcast
-        # dimension
-        results["groups"] = np.broadcast_to(results["groups"], final_groups_shape)
+
+    results["groups"] = np.broadcast_to(results["groups"], final_groups_shape)
     return results

@@ -691,7 +677,7 @@ def _finalize_results(
     results: IntermediateDict,
     agg: Aggregation,
     axis: Sequence[int],
-    expected_groups: Sequence | np.ndarray | None,
+    expected_groups: pd.Index | None,
     fill_value: Any,
 ):
     """Finalize results by
@@ -731,7 +717,7 @@
         finalized[agg.name] = reindex_(
             finalized[agg.name], squeezed["groups"], expected_groups, fill_value=fill_value
         )
-        finalized["groups"] = expected_groups
+        finalized["groups"] = expected_groups.to_numpy()
     else:
         finalized["groups"] = squeezed["groups"]

@@ -742,7 +728,7 @@ def _aggregate(
     x_chunk,
     combine: Callable,
     agg: Aggregation,
-    expected_groups: Sequence | np.ndarray | None,
+    expected_groups: pd.Index | None,
     axis: Sequence,
     keepdims,
     fill_value: Any,
@@ -796,7 +782,9 @@ def reindex_intermediates(x, agg, unique_groups):
     new_shape = x["groups"].shape[:-1] + (len(unique_groups),)
     newx = {"groups": np.broadcast_to(unique_groups, new_shape)}
     newx["intermediates"] = tuple(
-        reindex_(v, from_=x["groups"].squeeze(), to=unique_groups, fill_value=f)
+        reindex_(
+            v, from_=np.atleast_1d(x["groups"].squeeze()), to=pd.Index(unique_groups), fill_value=f
+        )
         for v, f in zip(x["intermediates"], agg.fill_value["intermediate"])
     )
     return newx
@@ -940,7 +928,7 @@ def split_blocks(applied, split_out, expected_groups, split_name):
     return intermediate, group_chunks


-def _reduce_blockwise(array, by, agg, *, axis, expected_groups, fill_value, isbin, engine):
+def _reduce_blockwise(array, by, agg, *, axis, expected_groups, fill_value, engine):
     """
     Blockwise groupby reduction that produces the final result. This code path is
     also used for non-dask array aggregations.
@@ -964,13 +952,12 @@ def _reduce_blockwise(array, by, agg, *, axis, expected_groups, fill_value, isbin, engine):
         by,
         func=func,
         axis=axis,
-        expected_groups=expected_groups if isbin else None,
+        expected_groups=expected_groups,
         # This fill_value should only apply to groups that only contain NaN observations
         # BUT there is funkiness when axis is a subset of all possible values
         # (see below)
         fill_value=(agg.fill_value[agg.name], 0),
         dtype=(agg.dtype[agg.name], np.intp),
-        isbin=isbin,
         kwargs=finalize_kwargs,
         engine=engine,
     )  # type: ignore
@@ -995,9 +982,6 @@ def _reduce_blockwise(array, by, agg, *, axis, expected_groups, fill_value, isbin, engine):
         value[mask] = np.nan
         results["intermediates"][0] = value

-    if isbin:
-        expected_groups = np.arange(len(expected_groups) - 1)
-
     # When axis is a subset of possible values; then npg will
     # apply it to groups that don't exist along a particular axis (for e.g.)
     # since these count as a group that is absent. thoo!
@@ -1022,12 +1006,11 @@ def dask_groupby_agg(
     array: DaskArray,
     by: DaskArray | np.ndarray,
     agg: Aggregation,
-    expected_groups: Sequence | np.ndarray | None,
+    expected_groups: pd.Index | None,
     axis: Sequence = None,
     split_out: int = 1,
     fill_value: Any = None,
     method: str = "map-reduce",
-    isbin: bool = False,
     reindex: bool = False,
     engine: str = "numpy",
 ) -> tuple[DaskArray, np.ndarray | DaskArray]:
@@ -1107,8 +1090,7 @@
         partial(
             blockwise_method,
             axis=axis,
-            expected_groups=expected_groups if reindex or split_out > 1 or isbin else None,
-            isbin=isbin,
+            expected_groups=expected_groups,
             engine=engine,
         ),
         inds,
@@ -1129,9 +1111,6 @@
         )
     else:
         intermediate = applied
-        # from this point on, we just work with bin indexes when binning
-        if isbin:
-            expected_groups = np.arange(len(expected_groups) - 1)
         if expected_groups is None:
             expected_groups = _get_expected_groups(by_input, raise_if_dask=False)
         group_chunks = (len(expected_groups),) if expected_groups is not None else (np.nan,)
@@ -1212,7 +1191,7 @@
     if method == "map-reduce":
         if expected_groups is None:
             expected_groups = _get_expected_groups(by_input)
-        groups = (expected_groups,)
+        groups = (expected_groups.values,)
     else:
         groups = (np.concatenate(groups_in_block),)

@@ -1269,12 +1248,22 @@ def _assert_by_is_aligned(shape, by):
     )


+def _convert_expected_groups_to_index(expected_groups, isbin: bool) -> pd.Index | None:
+    if isinstance(expected_groups, pd.Index):
+        return expected_groups
+    if isbin:
+        return pd.IntervalIndex.from_arrays(expected_groups[:-1], expected_groups[1:])
+    elif expected_groups is not None:
+        return pd.Index(expected_groups)
+    return None
+
+
 def groupby_reduce(
     array: np.ndarray | DaskArray,
     by: np.ndarray | DaskArray,
     func: str | Aggregation,
     *,
-    expected_groups: Sequence | np.ndarray = None,
+    expected_groups: Sequence | np.ndarray | None = None,
     sort: bool = True,
     isbin: bool = False,
     axis=None,
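A quick illustration of the new helper's three branches, assuming a flox build that includes this commit (output formatting shown for recent pandas):

import numpy as np
import pandas as pd

# _convert_expected_groups_to_index is a private helper added by this commit.
from flox.core import _convert_expected_groups_to_index

edges = np.array([0.0, 10.0, 20.0])
print(_convert_expected_groups_to_index(edges, isbin=True))
# IntervalIndex([(0.0, 10.0], (10.0, 20.0]], dtype='interval[float64, right]')

print(_convert_expected_groups_to_index(["jan", "feb"], isbin=False))
# Index(['jan', 'feb'], dtype='object')

idx = pd.Index([1, 2, 3])
assert _convert_expected_groups_to_index(idx, isbin=False) is idx  # passes through untouched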
@@ -1393,6 +1382,10 @@

     _assert_by_is_aligned(array.shape, by)

+    # We convert to pd.Index since that lets us know if we are binning or not
+    # (pd.IntervalIndex or not)
+    expected_groups = _convert_expected_groups_to_index(expected_groups, isbin)
+
     if axis is None:
         axis = tuple(array.ndim + np.arange(-by.ndim, 0))
     else:
@@ -1435,7 +1428,7 @@
         assert isinstance(finalize_kwargs, dict)
         agg.finalize_kwargs = finalize_kwargs

-    kwargs = dict(axis=axis, fill_value=fill_value, isbin=isbin, engine=engine)
+    kwargs = dict(axis=axis, fill_value=fill_value, engine=engine)

     if not is_duck_dask_array(array) and not is_duck_dask_array(by):
         results = _reduce_blockwise(array, by, agg, expected_groups=expected_groups, **kwargs)
@@ -1484,7 +1477,7 @@
             r, *g = partial_agg(
                 array_subset,
                 by[np.ix_(*indexer)],
-                expected_groups=cohort,
+                expected_groups=pd.Index(cohort),
                 # reindex to expected_groups at the blockwise step.
                 # this approach avoids replacing non-cohort members with
                 # np.nan or some other sentinel value, and preserves dtypes
@@ -1504,7 +1497,6 @@
         if method == "blockwise" and by.ndim == 1:
             array = rechunk_for_blockwise(array, axis=-1, labels=by)

-        # TODO: test with mixed array kinds (numpy array + dask by)
         result, *groups = partial_agg(
             array, by, expected_groups=expected_groups, reindex=reindex, method=method
         )

flox/xarray.py

Lines changed: 1 addition & 2 deletions
@@ -259,7 +259,6 @@ def xarray_reduce(
             tuple(g.data for g in by),
             axis,
             expected_groups,
-            isbin,
         )
         to_group = xr.DataArray(group_idx, dims=dim, coords={d: by[0][d] for d in by[0].indexes})
     else:
@@ -307,7 +306,7 @@ def wrapper(array, to_group, *, func, skipna, **kwargs):
         reindexed = reindex_(
             result,
             from_=groups,
-            to=np.arange(np.prod(group_shape)),
+            to=pd.Index(np.arange(np.prod(group_shape))),
             fill_value=fill_value,
             axis=-1,
         )

tests/__init__.py

Lines changed: 6 additions & 1 deletion
@@ -3,8 +3,11 @@
 from distutils import version

 import numpy as np
+import pandas as pd
 import pytest

+pd_types = (pd.Index,)
+
 try:
     import dask
     import dask.array as da
@@ -84,7 +87,9 @@ def assert_equal(a, b):
         a = np.array(a)
     if isinstance(b, list):
         b = np.array(b)
-    if has_xarray and isinstance(a, xr_types) or isinstance(b, xr_types):
+    if isinstance(a, pd_types) or isinstance(b, pd_types):
+        pd.testing.assert_index_equal(a, b)
+    elif has_xarray and isinstance(a, xr_types) or isinstance(b, xr_types):
         xr.testing.assert_identical(a, b)
     elif has_dask and isinstance(a, dask_array_type) or isinstance(b, dask_array_type):
         # does some validation of the dask graph
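With `pd.Index` objects now flowing through the helpers, `assert_equal` delegates index comparison to pandas. A hypothetical usage sketch:

import pandas as pd

# assert_index_equal checks values, dtype, and the exact Index subclass,
# so a pd.IntervalIndex will not silently compare equal to a plain pd.Index.
pd.testing.assert_index_equal(pd.Index([1, 2, 3]), pd.Index([1, 2, 3]))  # passes

try:
    pd.testing.assert_index_equal(pd.Index([1, 2, 3]), pd.Index([1, 2, 4]))
except AssertionError:
    print("mismatch detected")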
