Commit d471622

Support numba backend
Closes #17

commit 7c79cf65ab797201b8fcfc264a5596ddeb85fbdc
Author: dcherian <deepak@cherian.net>
Date:   Sun Oct 3 18:18:40 2021 +0530

    Signature improvements.
1 parent 4b6a64e commit d471622

File tree

5 files changed (+66, -14 lines)

ci/environment.yml

Lines changed: 1 addition & 0 deletions
@@ -15,5 +15,6 @@ dependencies:
   - numpy_groupies
   - pooch
   - toolz
+  - numba
   - pip:
     - icecream
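numba lands only in the CI environment here: numpy_groupies ships its numba kernels as an optional extra, so the library keeps working without it and the default backend stays "numpy". A minimal sketch of how a caller could pick a backend based on availability (the choose_backend helper is hypothetical, not part of this commit):

def choose_backend() -> str:
    # Prefer numba's JIT-compiled kernels when numba is importable;
    # otherwise fall back to the pure-numpy implementation.
    try:
        import numba  # noqa: F401
    except ImportError:
        return "numpy"
    return "numba"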

dask_groupby/core.py

Lines changed: 30 additions & 3 deletions
@@ -32,6 +32,17 @@
 FinalResultsDict = Dict[str, Union["DaskArray", np.ndarray]]
 
 
+def _get_aggregate(backend):
+    if backend == "numba":
+        return npg.aggregate_numba.aggregate
+    elif backend == "numpy":
+        return npg.aggregate_numpy.aggregate
+    else:
+        raise ValueError(
+            f"Expected backend to be one of ['numpy', 'numba']. Received {backend} instead."
+        )
+
+
 def _get_chunk_reduction(reduction_type: str) -> Callable:
     if reduction_type == "reduce":
         return chunk_reduce
@@ -353,6 +364,7 @@ def chunk_argreduce(
     dtype=None,
     reindex: bool = False,
     isbin: bool = False,
+    backend: str = "numpy",
 ) -> IntermediateDict:
     """
     Per-chunk arg reduction.
@@ -371,6 +383,7 @@ def chunk_argreduce(
         fill_value=fill_value,
         isbin=isbin,
         dtype=dtype,
+        backend=backend,
     )
     if not np.isnan(results["groups"]).all():
         # will not work for empty groups...
@@ -398,6 +411,7 @@ def chunk_reduce(
     dtype=None,
     reindex: bool = False,
     isbin: bool = False,
+    backend: str = "numpy",
 ) -> IntermediateDict:
     """
     Wrapper for numpy_groupies aggregate that supports nD ``array`` and
@@ -503,7 +517,7 @@ def chunk_reduce(
             fill_value=fv,
         )
     else:
-        result = npg.aggregate_numpy.aggregate(
+        result = _get_aggregate(backend)(
             group_idx,
             array,
             axis=-1,
@@ -612,9 +626,10 @@ def _npg_aggregate(
     group_ndim: int,
     fill_value: Any = None,
     min_count: Optional[int] = None,
+    backend: str = "numpy",
 ) -> FinalResultsDict:
     """Final aggregation step of tree reduction"""
-    results = _npg_combine(x_chunk, agg, axis, keepdims, group_ndim)
+    results = _npg_combine(x_chunk, agg, axis, keepdims, group_ndim, backend)
     return _finalize_results(results, agg, axis, expected_groups, fill_value, min_count)
 
 
@@ -624,6 +639,7 @@ def _npg_combine(
     axis: Sequence,
     keepdims: bool,
     group_ndim: int,
+    backend: str,
 ) -> IntermediateDict:
     """Combine intermediates step of tree reduction."""
     from dask.array.core import _concatenate2
@@ -682,6 +698,7 @@ def _conc2(key1, key2=None, axis=None) -> np.ndarray:
                 expected_groups=None,
                 fill_value=agg.fill_value["intermediate"][slicer],
                 dtype=agg.dtype,
+                backend=backend,
             )
 
     if agg.chunk[-1] == _count:
@@ -696,6 +713,7 @@ def _conc2(key1, key2=None, axis=None) -> np.ndarray:
                     expected_groups=None,
                     fill_value=(0,),
                     dtype=np.intp,
+                    backend=backend,
                 )["intermediates"][0]
             )
 
@@ -720,6 +738,7 @@ def _conc2(key1, key2=None, axis=None) -> np.ndarray:
                 axis=axis,
                 expected_groups=None,
                 fill_value=fv,
+                backend=backend,
             )
             results["intermediates"].append(*_results["intermediates"])
             results["groups"] = _results["groups"]
@@ -769,6 +788,7 @@ def groupby_agg(
     method: str = "mapreduce",
     min_count: Optional[int] = None,
     isbin: bool = False,
+    backend: str = "numpy",
 ) -> Tuple["DaskArray", Union[np.ndarray, "DaskArray"]]:
 
     import dask.array
@@ -806,6 +826,7 @@ def groupby_agg(
             fill_value=agg.fill_value["intermediate"],
             isbin=isbin,
             reindex=split_out > 1,
+            backend=backend,
         ),
         inds,
         array,
@@ -851,8 +872,9 @@ def groupby_agg(
             group_ndim=by.ndim,
             fill_value=fill_value,
             min_count=min_count,
+            backend=backend,
         ),
-        combine=partial(_npg_combine, agg=agg, group_ndim=by.ndim),
+        combine=partial(_npg_combine, agg=agg, group_ndim=by.ndim, backend=backend),
         name=f"{name}-reduce",
         dtype=array.dtype,
         axis=axis,
@@ -880,6 +902,7 @@ def groupby_agg(
             group_ndim=by.ndim,
             fill_value=fill_value,
             min_count=min_count,
+            backend=backend,
             axis=axis,
             keepdims=True,
         ),
@@ -963,6 +986,7 @@ def groupby_reduce(
     min_count: Optional[int] = None,
     split_out: int = 1,
     method: str = "mapreduce",
+    backend: str = "numpy",
 ) -> Tuple["DaskArray", Union[np.ndarray, "DaskArray"]]:
     """
     GroupBy reductions using tree reductions for dask.array
@@ -1005,6 +1029,8 @@ def groupby_reduce(
         This works well for many time groupings where the group labels repeat
        at regular intervals like 'hour', 'month', 'dayofyear' etc. Optimize
        chunking ``array`` for this method by first rechunking using ``rechunk_for_cohorts``.
+    backend: {"numpy", "numba"}, optional
+        Backend for numpy_groupies. numpy by default.
 
     Returns
     -------
@@ -1148,6 +1174,7 @@ def groupby_reduce(
         fill_value=fill_value,
         min_count=min_count,
         isbin=isbin,
+        backend=backend,
     )
     if method == "cohorts":
         assert len(axis) == 1
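Taken together, these changes thread a single `backend` keyword from `groupby_reduce` down to the per-chunk `npg.aggregate` call, where `_get_aggregate` selects the implementation. A minimal usage sketch (the arrays are invented and the import path follows the test modules; with "numba", the first call pays a one-time JIT-compilation cost and later calls reuse the compiled kernels):

import numpy as np

from dask_groupby.core import groupby_reduce

array = np.array([1.0, 2.0, 4.0, 8.0, 16.0])
by = np.array([0, 0, 1, 1, 1])

# Same reduction through both numpy_groupies backends; the group sums
# should agree.
result_np, groups = groupby_reduce(array, by, func="sum", backend="numpy")
result_nb, _ = groupby_reduce(array, by, func="sum", backend="numba")
np.testing.assert_array_equal(result_np, result_nb)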

dask_groupby/xarray.py

Lines changed: 10 additions & 1 deletion
@@ -57,6 +57,7 @@ def xarray_reduce(
     split_out: int = 1,
     fill_value=None,
     method: str = "mapreduce",
+    backend: str = "numpy",
     keep_attrs: bool = True,
     skipna: bool = True,
     min_count: Optional[int] = None,
@@ -105,8 +106,15 @@ def xarray_reduce(
         'month', 'dayofyear' etc. Optimize chunking ``array`` for this
         method by first rechunking using ``rechunk_for_cohorts``.
 
-    skipna: bool
+    backend: {"numpy", "numba"}, optional
+        Backend for numpy_groupies.
+    keep_attrs: bool, optional
+        Preserve attrs?
+    skipna: bool, optional
         Use NaN-skipping aggregations like nanmean?
+    min_count: int, optional
+        NaN out when number of non-NaN values in aggregation is < min_count.
+        Only applies to nansum, nanprod.
 
     Raises
     ------
@@ -266,6 +274,7 @@ def wrapper(*args, **kwargs):
         "fill_value": fill_value,
         "method": method,
         "min_count": min_count,
+        "backend": backend,
         # The following mess exists because for multiple `by`s I factorize eagerly
         # here before passing it on; this means I have to handle the
         # "binning by single by variable" case explicitly where the factorization

tests/test_core.py

Lines changed: 20 additions & 8 deletions
@@ -31,6 +31,7 @@ def test_alignment_error():
         groupby_reduce(da, labels, func="mean")
 
 
+@pytest.mark.parametrize("backend", ["numpy", "numba"])
 @pytest.mark.parametrize("dtype", (float, int))
 @pytest.mark.parametrize("chunk, split_out", [(False, 1), (True, 1), (True, 2), (True, 3)])
 @pytest.mark.parametrize("expected_groups", [None, [0, 1, 2], np.array([0, 1, 2])])
@@ -59,7 +60,9 @@ def test_alignment_error():
         # (np.ones((12,)), np.array([labels, labels])),  # form 4
     ],
 )
-def test_groupby_reduce(array, by, expected, func, expected_groups, chunk, split_out, dtype):
+def test_groupby_reduce(
+    array, by, expected, func, expected_groups, chunk, split_out, dtype, backend
+):
     array = array.astype(dtype)
     if chunk:
         if expected_groups is None:
@@ -81,10 +84,12 @@ def test_groupby_reduce(array, by, expected, func, expected_groups, chunk, split
         expected_groups=expected_groups,
         fill_value=123,
         split_out=split_out,
+        backend=backend,
     )
     assert_equal(expected, result)
 
 
+@pytest.mark.parametrize("backend", ["numpy", "numba"])
 @pytest.mark.parametrize("size", ((12,), (12, 5)))
 @pytest.mark.parametrize(
     "func",
@@ -109,7 +114,7 @@ def test_groupby_reduce(array, by, expected, func, expected_groups, chunk, split
         pytest.param("nanargmin", marks=(pytest.mark.xfail,)),
     ),
 )
-def test_groupby_reduce_all(size, func):
+def test_groupby_reduce_all(size, func, backend):
 
     array = np.random.randn(*size)
     by = np.ones(size[-1])
@@ -123,13 +128,15 @@ def test_groupby_reduce_all(size, func):
     expected = getattr(np, func)(array, axis=-1)
     expected = np.expand_dims(expected, -1)
 
-    actual, _ = groupby_reduce(array, by, func=func)
+    actual, _ = groupby_reduce(array, by, func=func, backend=backend)
     if "arg" in func:
         assert actual.dtype.kind == "i"
     assert_equal(actual, expected)
 
     for method in ["mapreduce", "cohorts"]:
-        actual, _ = groupby_reduce(da.from_array(array, chunks=3), by, func=func, method=method)
+        actual, _ = groupby_reduce(
+            da.from_array(array, chunks=3), by, func=func, method=method, backend=backend
+        )
         if "arg" in func:
             assert actual.dtype.kind == "i"
         assert_equal(actual, expected)
@@ -336,14 +343,15 @@ def test_dask_reduce_axis_subset():
     )
 
 
+@pytest.mark.parametrize("backend", ["numpy", "numba"])
 @pytest.mark.parametrize(
     "axis", [None, (0, 1, 2), (0, 1), (0, 2), (1, 2), 0, 1, 2, (0,), (1,), (2,)]
 )
-def test_groupby_reduce_axis_subset_against_numpy(axis):
+def test_groupby_reduce_axis_subset_against_numpy(axis, backend):
     # tests against the numpy output to make sure dask compute matches
     by = np.broadcast_to(labels2d, (3, *labels2d.shape))
     array = np.ones_like(by)
-    kwargs = dict(func="count", axis=axis, expected_groups=[0, 2], fill_value=123)
+    kwargs = dict(func="count", axis=axis, expected_groups=[0, 2], fill_value=123, backend=backend)
     with raise_if_dask_computes():
         actual, _ = groupby_reduce(
             da.from_array(array, chunks=(-1, 2, 3)),
@@ -354,6 +362,7 @@ def test_groupby_reduce_axis_subset_against_numpy(axis):
     assert_equal(actual, expected)
 
 
+@pytest.mark.parametrize("backend", ["numpy", "numba"])
 @pytest.mark.parametrize("chunks", [None, (2, 2, 3)])
 @pytest.mark.parametrize(
     "axis, groups, expected_shape",
@@ -363,7 +372,7 @@ def test_groupby_reduce_axis_subset_against_numpy(axis):
         (None, [0], (1,)),  # global reduction; 0 shaped group axis; 1 group
     ],
 )
-def test_groupby_reduce_nans(chunks, axis, groups, expected_shape):
+def test_groupby_reduce_nans(chunks, axis, groups, expected_shape, backend):
     def _maybe_chunk(arr):
         if chunks:
             return da.from_array(arr, chunks=chunks)
@@ -383,6 +392,7 @@ def _maybe_chunk(arr):
         expected_groups=groups,
         axis=axis,
         fill_value=0,
+        backend=backend,
     )
     assert_equal(result, np.zeros(expected_shape, dtype=np.int64))
 
@@ -394,7 +404,8 @@ def _maybe_chunk(arr):
     # by = np.broadcast_to(labels2d, (3, *labels2d.shape))
 
 
-def test_groupby_all_nan_blocks():
+@pytest.mark.parametrize("backend", ["numpy", "numba"])
+def test_groupby_all_nan_blocks(backend):
     labels = np.array([0, 0, 2, 2, 2, 1, 1, 2, 2, 1, 1, 0])
     nan_labels = labels.astype(float)  # copy
     nan_labels[:5] = np.nan
@@ -410,6 +421,7 @@ def test_groupby_all_nan_blocks():
         da.from_array(by, chunks=(1, 3)),
         func="sum",
         expected_groups=None,
+        backend=backend,
     )
     assert_equal(actual, expected)
tests/test_xarray.py

Lines changed: 5 additions & 2 deletions
@@ -16,10 +16,11 @@
 dask.config.set(scheduler="sync")
 
 
+@pytest.mark.parametrize("backend", ["numpy", "numba"])
 @pytest.mark.parametrize("min_count", [None, 1, 3])
 @pytest.mark.parametrize("add_nan", [True, False])
 @pytest.mark.parametrize("skipna", [True, False])
-def test_xarray_reduce(skipna, add_nan, min_count):
+def test_xarray_reduce(skipna, add_nan, min_count, backend):
     arr = np.ones((4, 12))
 
     if add_nan:
@@ -38,7 +39,9 @@ def test_xarray_reduce(skipna, add_nan, min_count):
     ).expand_dims(z=4)
 
     expected = da.groupby("labels").sum(skipna=skipna, min_count=min_count)
-    actual = xarray_reduce(da, "labels", func="sum", skipna=skipna, min_count=min_count)
+    actual = xarray_reduce(
+        da, "labels", func="sum", skipna=skipna, min_count=min_count, backend=backend
+    )
     assert_equal(expected, actual)
 
     # test dimension ordering
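Both test modules opt in the same way: a `@pytest.mark.parametrize("backend", ["numpy", "numba"])` decorator on each affected test, which keeps the extra dimension of the test matrix visible at every test site. An alternative sketch (not in this commit) that yields the same matrix with less repetition is a shared fixture in conftest.py:

import pytest

@pytest.fixture(params=["numpy", "numba"])
def backend(request):
    # Any test that accepts a `backend` argument runs once per backend,
    # without repeating the parametrize decorator.
    return request.param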
