
Commit a13ca36

Use nanlen instead of _count
1 parent d471622 commit a13ca36

File tree: 2 files changed (+15, -38 lines)


dask_groupby/aggregations.py

Lines changed: 7 additions & 21 deletions
@@ -116,22 +116,8 @@ def nansum_of_squares(group_idx, array, size=None, fill_value=None):
     return sum_of_squares(group_idx, array, func="nansum", size=size, fill_value=fill_value)


-def _count(group_idx, array, size=None, fill_value=None):
-    import numpy_groupies as npg
-
-    return npg.aggregate_numpy.aggregate(
-        group_idx,
-        ~np.isnan(array),
-        axis=-1,
-        func="sum",
-        size=size,
-        fill_value=fill_value,
-        dtype=np.intp,
-    )
-
-
 count = Aggregation(
-    "count", chunk=_count, combine="sum", fill_value=0, final_fill_value=0, dtype=int
+    "count", chunk="nanlen", combine="sum", fill_value=0, final_fill_value=0, dtype=np.intp
 )

 # note that the fill values are the result of np.func([np.nan, np.nan])
@@ -141,15 +127,15 @@ def _count(group_idx, array, size=None, fill_value=None):
 nanprod = Aggregation("nanprod", chunk="nanprod", combine="prod", fill_value=1, final_fill_value=1)
 mean = Aggregation(
     "mean",
-    chunk=("sum", _count),
+    chunk=("sum", "nanlen"),
     combine=("sum", "sum"),
     finalize=lambda sum_, count: sum_ / count,
     fill_value=(0, 0),
     dtype=np.float64,
 )
 nanmean = Aggregation(
     "nanmean",
-    chunk=("nansum", _count),
+    chunk=("nansum", "nanlen"),
     combine=("sum", "sum"),
     finalize=lambda sum_, count: sum_ / count,
     fill_value=(0, 0),
@@ -171,7 +157,7 @@ def _std_finalize(sumsq, sum_, count, ddof=0):
 # var, std always promote to float, so we set nan
 var = Aggregation(
     "var",
-    chunk=(sum_of_squares, "sum", _count),
+    chunk=(sum_of_squares, "sum", "nanlen"),
     combine=("sum", "sum", "sum"),
     finalize=_var_finalize,
     fill_value=0,
@@ -180,7 +166,7 @@ def _std_finalize(sumsq, sum_, count, ddof=0):
 )
 nanvar = Aggregation(
     "nanvar",
-    chunk=(nansum_of_squares, "nansum", _count),
+    chunk=(nansum_of_squares, "nansum", "nanlen"),
     combine=("sum", "sum", "sum"),
     finalize=_var_finalize,
     fill_value=0,
@@ -189,7 +175,7 @@ def _std_finalize(sumsq, sum_, count, ddof=0):
 )
 std = Aggregation(
     "std",
-    chunk=(sum_of_squares, "sum", _count),
+    chunk=(sum_of_squares, "sum", "nanlen"),
     combine=("sum", "sum", "sum"),
     finalize=_std_finalize,
     fill_value=0,
@@ -198,7 +184,7 @@ def _std_finalize(sumsq, sum_, count, ddof=0):
 )
 nanstd = Aggregation(
     "nanstd",
-    chunk=(nansum_of_squares, "nansum", _count),
+    chunk=(nansum_of_squares, "nansum", "nanlen"),
     combine=("sum", "sum", "sum"),
     finalize=_std_finalize,
     fill_value=0,
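Note: numpy_groupies ships a built-in "nanlen" aggregation that counts the non-NaN elements in each group, which is what the deleted _count helper computed by summing a ~np.isnan(array) mask; the Aggregation definitions above can therefore refer to it by name, and the count aggregation's dtype becomes np.intp to match what _count used to force. A minimal sketch of the equivalence (not part of the commit; the group labels and values are invented):

import numpy as np
import numpy_groupies as npg

group_idx = np.array([0, 0, 1, 1, 1])
array = np.array([1.0, np.nan, 2.0, np.nan, 3.0])

# npg's built-in "nanlen": number of non-NaN elements per group
counts = npg.aggregate(group_idx, array, func="nanlen", fill_value=0)

# what the removed _count helper computed: per-group sum of a non-NaN mask
manual = npg.aggregate(group_idx, ~np.isnan(array), func="sum", fill_value=0, dtype=np.intp)

assert counts.tolist() == [1, 2]
assert manual.tolist() == [1, 2]

The mean/var/std aggregations simply swap "nanlen" in as the counting member of their chunk tuples.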

dask_groupby/core.py

Lines changed: 8 additions & 17 deletions
@@ -21,7 +21,7 @@
 import pandas as pd

 from . import aggregations
-from .aggregations import Aggregation, _atleast_1d, _count, _get_fill_value
+from .aggregations import Aggregation, _atleast_1d, _get_fill_value
 from .xrutils import is_duck_array, is_duck_dask_array

 if TYPE_CHECKING:
@@ -525,7 +525,7 @@ def chunk_reduce(
             size=size,
             # important when reducing with "offset" groups
             fill_value=fv,
-            dtype=dtype,
+            dtype=np.intp if reduction == "nanlen" else dtype,
         )
         if np.any(~mask):
             # remove NaN group label which should be last
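Note on the dtype override above: the dtype forwarded here is whatever the overall aggregation requested (float64 for a mean, say), and numpy_groupies generally honours an explicit dtype argument, so without the override the counts intermediate could come back floating-point. Forcing np.intp for "nanlen" keeps it integral. A small sketch with made-up inputs (the exact output dtype shown is an assumption about numpy_groupies' behaviour):

import numpy as np
import numpy_groupies as npg

group_idx = np.array([0, 1, 1])
array = np.array([np.nan, 4.0, 5.0])

# force an integer count regardless of the dtype the main reduction wants
counts = npg.aggregate(group_idx, array, func="nanlen", dtype=np.intp)
print(counts, counts.dtype)  # expected: [0 2] intp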
@@ -578,13 +578,6 @@ def _finalize_results(
     2. Calling agg.finalize with intermediate results
     3. Mask using counts and fill with user-provided fill_value.
     4. reindex to expected_groups
-
-    Parameters
-    ----------
-    mask_counts: bool
-        Whether to mask out results using counts which is expected to be the last element in
-        results["intermediates"]. Should be False when dask arrays are not involved.
-
     """
     squeezed = _squeeze_results(results, axis)

@@ -682,7 +675,7 @@ def _conc2(key1, key2=None, axis=None) -> np.ndarray:
     if agg.reduction_type == "argreduce":

         # If _count was added for masking later, we need to account for that
-        if agg.chunk[-1] == _count:
+        if agg.chunk[-1] == "nanlen":
             slicer = slice(None, -1)
         else:
             slicer = slice(None, None)
@@ -701,7 +694,7 @@ def _conc2(key1, key2=None, axis=None) -> np.ndarray:
             backend=backend,
         )

-        if agg.chunk[-1] == _count:
+        if agg.chunk[-1] == "nanlen":
             counts = _conc2(key1="intermediates", key2=2, axis=axis)
             # sum the counts
             results["intermediates"].append(
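The two hunks above deal with the counts that get appended as the last intermediate: for argreduce they are sliced off before the regular combine, then concatenated and re-summed on their own. Summing is the right combine step because non-NaN counts are additive across chunks; a toy check (chunks invented for illustration):

import numpy as np

chunk1 = np.array([1.0, np.nan, 3.0])
chunk2 = np.array([np.nan, 5.0])

# counting per chunk and summing ...
per_chunk = np.sum(~np.isnan(chunk1)) + np.sum(~np.isnan(chunk2))
# ... matches counting over the concatenated data
overall = np.sum(~np.isnan(np.concatenate([chunk1, chunk2])))
assert per_chunk == overall == 3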
@@ -1116,12 +1109,10 @@ def groupby_reduce(
         # (agg.finalize = None). We still need to do the reindexing step in finalize
         # so that everything matches the dask version.
         reduction.finalize = None
-        # npg's count counts the number of groups
-        # we want to count the number of non-NaN array elements in each group
-        # So we use our custom _count instead of "count"
-        func = reduction.name if reduction.name != "count" else _count
+        # xarray's count is npg's nanlen
+        func = reduction.name if reduction.name != "count" else "nanlen"
         if min_count is not None:
-            func = (func, _count)
+            func = (func, "nanlen")

         results = chunk_reduce(
             array,
@@ -1162,7 +1153,7 @@ def groupby_reduce(

         # we need to explicitly track counts so that we can mask at the end
         if fill_value is not None or min_count is not None:
-            reduction.chunk += (_count,)
+            reduction.chunk += ("nanlen",)
             reduction.combine += ("sum",)
             reduction.fill_value["intermediate"] += (0,)
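Appending "nanlen" (combined with "sum" and an intermediate fill value of 0) gives the dask path a per-group count of non-NaN values to mask with at the end, which is what fill_value and min_count need. Roughly the idea, as a hedged sketch rather than the library's actual finalize code:

import numpy as np

# per-group reduced values and non-NaN counts (numbers invented)
result = np.array([6.0, 0.0, 9.0])
counts = np.array([3, 0, 1])

fill_value = np.nan
min_count = 2  # require at least 2 non-NaN values per group

masked = np.where(counts >= min_count, result, fill_value)
print(masked)  # [ 6. nan nan]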
