Support method="blockwise" for scans

dcherian · claude · dcherian · commit 1b30f08a1478 · 2025-12-09T00:06:13.000-07:00
🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
diff --git a/flox/core.py b/flox/core.py
@@ -113,6 +113,7 @@
     _is_reindex_sparse_supported_reduction,
     _issorted,
     _postprocess_numbagg,
+    _should_auto_rechunk_blockwise,
 )
 
 
@@ -962,7 +963,7 @@ def groupby_reduce(
     has_dask = is_duck_dask_array(array) or is_duck_dask_array(by_)
     has_cubed = is_duck_cubed_array(array) or is_duck_cubed_array(by_)
 
-    if method is None and is_duck_dask_array(array) and not any_by_dask and by_.ndim == 1 and _issorted(by_):
+    if _should_auto_rechunk_blockwise(method, array, any_by_dask, by_):
         # Let's try rechunking for sorted 1D by.
         (single_axis,) = axis_
         method, array = rechunk_for_blockwise(array, single_axis, by_, force=False)
diff --git a/flox/dask.py b/flox/dask.py
@@ -14,12 +14,17 @@
 import toolz as tlz
 
 if TYPE_CHECKING:
-    from .aggregations import Aggregation, Scan
+    from typing import Literal
+
+    from .aggregations import Aggregation
     from .core import T_Axes, T_Engine, T_Method
     from .lib import ArrayLayer
     from .reindex import ReindexArrayType, ReindexStrategy
     from .types import DaskArray, Graph, IntermediateDict, T_By
 
+    T_ScanMethod = Literal["blelloch", "blockwise"]
+
+from .aggregations import Scan, scan_binary_op
 from .core import (
     DUMMY_AXIS,
     _get_chunk_reduction,
@@ -34,6 +39,7 @@
     ReindexStrategy,
     reindex_,
 )
+from .scan import _finalize_scan, _zip, chunk_scan, grouped_reduce
 from .types import FinalResultsDict, IntermediateDict
 from .xrutils import is_duck_dask_array, notnull
 
@@ -567,49 +573,91 @@ def dask_groupby_agg(
     return (result, groups)
 
 
-def dask_groupby_scan(array, by, axes: T_Axes, agg: Scan) -> DaskArray:
+def dask_groupby_scan(array, by, axes: T_Axes, agg: Scan, method: T_ScanMethod = "blelloch") -> DaskArray:
+    """Grouped scan for dask arrays.
+
+    Parameters
+    ----------
+    array : DaskArray
+        Input array to scan.
+    by : DaskArray
+        Group labels array, must have same chunks as array along scan axis.
+    axes : T_Axes
+        Tuple of axes to scan along (must be a single axis).
+    agg : Scan
+        Scan aggregation specification.
+    method : {"blelloch", "blockwise"}, optional
+        Scan method to use:
+        - "blelloch": Blelloch parallel prefix scan algorithm, allows scanning
+          across chunk boundaries using tree reduction. Default.
+        - "blockwise": Each chunk is processed independently. Only valid when
+          all members of each group are contained within a single chunk.
+
+    Returns
+    -------
+    DaskArray
+        Result of the grouped scan with same shape and chunks as input.
+    """
     from dask.array import map_blocks
     from dask.array.reductions import cumreduction as scan
-
-    from .aggregations import scan_binary_op
-    from .scan import _finalize_scan, _zip, chunk_scan, grouped_reduce
+    from dask.base import tokenize
 
     if len(axes) > 1:
         raise NotImplementedError("Scans are only supported along a single axis.")
     (axis,) = axes
 
     array, by = _unify_chunks(array, by)
 
+    # Include method in token to differentiate task graphs
+    token = tokenize(array, by, agg, axes, method)
+
     # 1. zip together group indices & array
     zipped = map_blocks(
         _zip,
         by,
         array,
         dtype=array.dtype,
         meta=array._meta,
-        name="groupby-scan-preprocess",
+        name=f"groupby-scan-preprocess-{token}",
     )
 
-    scan_ = partial(chunk_scan, agg=agg)
-    # dask tokenizing error workaround
-    scan_.__name__ = scan_.func.__name__  # type: ignore[attr-defined]
-
     # 2. Run the scan
-    accumulated = scan(
-        func=scan_,
-        binop=partial(scan_binary_op, agg=agg),
-        ident=agg.identity,
-        x=zipped,
-        axis=axis,
-        # TODO: support method="sequential" here.
-        method="blelloch",
-        preop=partial(grouped_reduce, agg=agg),
+    if method == "blockwise":
+        # Apply chunk_scan blockwise - each block independently
+        scan_func = partial(chunk_scan, agg=agg, axis=axis, dtype=agg.dtype)
+        scanned = map_blocks(
+            scan_func,
+            zipped,
+            dtype=agg.dtype,
+            meta=array._meta,
+            name=f"groupby-scan-{token}",
+        )
+    else:
+        # Use Blelloch parallel prefix scan algorithm
+        scan_ = partial(chunk_scan, agg=agg)
+        # dask tokenizing error workaround
+        scan_.__name__ = scan_.func.__name__  # type: ignore[attr-defined]
+
+        scanned = scan(
+            func=scan_,
+            binop=partial(scan_binary_op, agg=agg),
+            ident=agg.identity,
+            x=zipped,
+            axis=axis,
+            # TODO: support method="sequential" here.
+            method="blelloch",
+            preop=partial(grouped_reduce, agg=agg),
+            dtype=agg.dtype,
+        )
+
+    # 3. Extract final result
+    result = map_blocks(
+        partial(_finalize_scan, dtype=agg.dtype),
+        scanned,
         dtype=agg.dtype,
+        name=f"groupby-scan-finalize-{token}",
     )
 
-    # 3. Unzip and extract the final result array, discard groups
-    result = map_blocks(partial(_finalize_scan, dtype=agg.dtype), accumulated, dtype=agg.dtype)
-
     assert result.chunks == array.chunks
 
     return result
diff --git a/flox/lib.py b/flox/lib.py
@@ -4,7 +4,7 @@
 from typing import TYPE_CHECKING, TypeAlias, TypeVar
 
 from .types import DaskArray, Graph
-from .xrutils import module_available
+from .xrutils import is_duck_dask_array, module_available
 
 if TYPE_CHECKING:
     from .aggregations import Aggregation
@@ -78,6 +78,11 @@ def _issorted(arr, ascending=True) -> bool:
         return bool((arr[:-1] >= arr[1:]).all())
 
 
+def _should_auto_rechunk_blockwise(method, array, any_by_dask: bool, by) -> bool:
+    """Check if we should attempt automatic rechunking for blockwise operations."""
+    return method is None and is_duck_dask_array(array) and not any_by_dask and by.ndim == 1 and _issorted(by)
+
+
 def _is_nanlen(reduction) -> bool:
     return isinstance(reduction, str) and reduction == "nanlen"
 
diff --git a/flox/scan.py b/flox/scan.py
@@ -19,7 +19,10 @@
     _atleast_1d,
     generic_aggregate,
 )
+from .cohorts import find_group_cohorts
 from .factorize import _factorize_multiple
+from .lib import _should_auto_rechunk_blockwise
+from .rechunk import rechunk_for_blockwise
 from .xrutils import is_duck_array, is_duck_dask_array, module_available
 
 if module_available("numpy", minversion="2.0.0"):
@@ -28,6 +31,8 @@
     from numpy.core.numeric import normalize_axis_tuple  # type: ignore[no-redef]
 
 if TYPE_CHECKING:
+    from typing import Literal
+
     from .core import (
         T_By,
         T_EngineOpt,
@@ -37,6 +42,41 @@
     )
     from .types import DaskArray
 
+    T_ScanMethod = Literal["blockwise", "blelloch"]
+
+
+def _choose_scan_method(
+    method: T_MethodOpt, preferred_method: T_ScanMethod, nax: int, by_ndim: int
+) -> T_ScanMethod:
+    """Choose the scan method based on user input and preferred method.
+
+    Parameters
+    ----------
+    method : T_MethodOpt
+        User-specified method, or None for automatic selection.
+    preferred_method : T_ScanMethod
+        The preferred method based on data layout analysis.
+    nax : int
+        Number of axes being scanned along.
+    by_ndim : int
+        Number of dimensions in the `by` array.
+
+    Returns
+    -------
+    T_ScanMethod
+        The chosen scan method: "blockwise" or "blelloch".
+    """
+    if method is None:
+        # Blockwise requires scanning along all dimensions of `by`; otherwise use blelloch
+        if nax != by_ndim:
+            return "blelloch"
+        return preferred_method
+    elif method == "blockwise":
+        return "blockwise"
+    else:
+        # For any other method (including map-reduce, cohorts), use blelloch
+        return "blelloch"
+
 
 def _validate_expected_groups(nby, expected_groups):
     """Validate expected_groups for scan operations."""
@@ -91,8 +131,8 @@ def groupby_scan(
         Value to assign when a label in ``expected_groups`` is not present.
     dtype : data-type , optional
         DType for the output. Can be anything that is accepted by ``np.dtype``.
-    method : {"blockwise", "cohorts"}, optional
-        Strategy for reduction of dask arrays only:
+    method : {"blockwise", "blelloch"}, optional
+        Strategy for scan of dask arrays only:
           * ``"blockwise"``:
             Only scan using blockwise and avoid aggregating blocks
             together. Useful for resampling-style groupby problems where group
@@ -101,14 +141,10 @@ def groupby_scan(
             i.e. each block contains all members of any group present
             in that block. For nD `by`, you must make sure that all members of a group
             are present in a single block.
-          * ``"cohorts"``:
-            Finds group labels that tend to occur together ("cohorts"),
-            indexes out cohorts and reduces that subset using "map-reduce",
-            repeat for all cohorts. This works well for many time groupings
-            where the group labels repeat at regular intervals like 'hour',
-            'month', dayofyear' etc. Optimize chunking ``array`` for this
-            method by first rechunking using ``rechunk_for_cohorts``
-            (for 1D ``by`` only).
+          * ``"blelloch"``:
+            Use Blelloch's parallel prefix scan algorithm, which allows
+            scanning across chunk boundaries. This is the default when groups
+            span multiple chunks.
     engine : {"flox", "numpy", "numba", "numbagg"}, optional
         Algorithm to compute the groupby reduction on non-dask arrays and on each dask chunk:
           * ``"numpy"``:
@@ -149,8 +185,6 @@ def groupby_scan(
 
     if engine is not None:
         raise NotImplementedError("Setting `engine` is not supported for scans yet.")
-    if method is not None:
-        raise NotImplementedError("Setting `method` is not supported for scans yet.")
     if engine is None:
         engine = "flox"
     assert engine == "flox"
@@ -191,6 +225,38 @@ def groupby_scan(
     by_: np.ndarray
     (by_,) = bys
     has_dask = is_duck_dask_array(array) or is_duck_dask_array(by_)
+    nax = len(axis_)
+
+    # Method selection for dask arrays
+    scan_method: T_ScanMethod = "blelloch"
+    if has_dask:
+        (single_axis,) = axis_  # type: ignore[misc]
+
+        # Try rechunking for sorted 1D by when method is not specified
+        if _should_auto_rechunk_blockwise(method, array, any_by_dask, by_):
+            rechunk_method, array = rechunk_for_blockwise(array, single_axis, by_, force=False)
+            if rechunk_method == "blockwise":
+                method = "blockwise"
+
+        # Determine preferred method based on data layout
+        if not any_by_dask and method is None:
+            cohorts_method, _ = find_group_cohorts(
+                by_,
+                [array.chunks[ax] for ax in range(-by_.ndim, 0)],  # type: ignore[union-attr]
+                expected_groups=None,
+                merge=False,
+            )
+            # Map groupby_reduce methods to scan methods
+            preferred_method: T_ScanMethod = "blockwise" if cohorts_method == "blockwise" else "blelloch"
+        else:
+            preferred_method = "blelloch"
+
+        # Choose the final method
+        scan_method = _choose_scan_method(method, preferred_method, nax, by_.ndim)
+
+        # Rechunk if blockwise was explicitly requested but data isn't aligned
+        if preferred_method != "blockwise" and scan_method == "blockwise" and by_.ndim == 1:
+            _, array = rechunk_for_blockwise(array, axis=-1, labels=by_)
 
     if array.dtype.kind in "Mm":
         cast_to = array.dtype
@@ -237,7 +303,7 @@ def groupby_scan(
     else:
         from .dask import dask_groupby_scan
 
-        result = dask_groupby_scan(inp.array, inp.group_idx, axes=axis_, agg=agg)
+        result = dask_groupby_scan(inp.array, inp.group_idx, axes=axis_, agg=agg, method=scan_method)
 
     # Made a design choice here to have `postprocess` handle both array and group_idx
     out = AlignedArrays(array=result, group_idx=by_)
diff --git a/tests/test_core.py b/tests/test_core.py
@@ -2072,6 +2072,52 @@ def test_blockwise_nans() -> None:
     assert_equal(expected, actual)
 
 
+@requires_dask
+@pytest.mark.parametrize("func", ["nancumsum", "ffill", "bfill"])
+@pytest.mark.parametrize("method", ["blockwise", "blelloch"])
+def test_groupby_scan_method(func, method) -> None:
+    """Test that groupby_scan works correctly with explicit method parameter."""
+    # Create array where groups fit within chunks (suitable for blockwise)
+    # Include NaN values for ffill/bfill to actually test gap filling
+    if "fill" in func:
+        data = [1.0, np.nan, 3.0, 4.0, np.nan, 6.0]
+    else:
+        data = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]
+    array = dask.array.from_array(data, chunks=3)
+    by = np.array([0, 0, 0, 1, 1, 1])
+
+    expected = groupby_scan(array.compute(), by, func=func, axis=-1)
+    actual = groupby_scan(array, by, func=func, axis=-1, method=method)
+
+    assert_equal(expected, actual)
+
+
+@requires_dask
+def test_groupby_scan_blockwise_auto_rechunk() -> None:
+    """Test that blockwise scan auto-rechunks when groups are sorted but span chunks."""
+    from flox import scan
+    from flox.rechunk import rechunk_for_blockwise as real_rechunk
+
+    # Create array with sorted groups that span chunk boundaries
+    array = dask.array.from_array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], chunks=2)
+    by = np.array([0, 0, 0, 1, 1, 1])  # sorted, but group 0 spans chunks 0 and 1
+
+    expected = groupby_scan(array.compute(), by, func="nancumsum", axis=-1)
+
+    # This should auto-rechunk to enable blockwise
+    with patch.object(scan, "rechunk_for_blockwise", wraps=real_rechunk) as rechunk_spy:
+        actual = groupby_scan(array, by, func="nancumsum", axis=-1)
+        assert_equal(expected, actual)
+        # Verify rechunk_for_blockwise was called
+        assert rechunk_spy.call_count >= 1
+
+    # Explicit method="blockwise" should also rechunk and produce correct results
+    with patch.object(scan, "rechunk_for_blockwise", wraps=real_rechunk) as rechunk_spy:
+        actual_explicit = groupby_scan(array, by, func="nancumsum", axis=-1, method="blockwise")
+        assert_equal(expected, actual_explicit)
+        assert rechunk_spy.call_count >= 1
+
+
 @pytest.mark.parametrize("func", ["sum", "prod", "count", "nansum"])
 @pytest.mark.parametrize("engine", ["flox", "numpy"])
 def test_agg_dtypes(func, engine) -> None: