cohorts: Delete the merge kwarg (#313)

dcherian · web-flow · commit 0c4a7f9fa290 · 2024-01-05T23:24:42.000Z
Merging cohorts is now so fast that we always do it, so the `merge` kwarg is no longer needed.
diff --git a/flox/core.py b/flox/core.py
@@ -248,9 +248,7 @@ def _compute_label_chunk_bitmask(labels, chunks, nlabels):
 
 
 # @memoize
-def find_group_cohorts(
-    labels, chunks, merge: bool = True, expected_groups: None | pd.RangeIndex = None
-) -> dict:
+def find_group_cohorts(labels, chunks, expected_groups: None | pd.RangeIndex = None) -> dict:
     """
     Finds groups labels that occur together aka "cohorts"
 
@@ -265,9 +263,8 @@ def find_group_cohorts(
         represents NaNs.
     chunks : tuple
         chunks of the array being reduced
-    merge : bool, optional
-        Attempt to merge cohorts when one cohort's chunks are a subset
-        of another cohort's chunks.
+    expected_groups : pd.RangeIndex, optional
+        Used to extract the largest label expected
 
     Returns
     -------
@@ -322,13 +319,7 @@ def invert(x) -> tuple[np.ndarray, ...]:
     # 4. Existing cohorts don't overlap, great for time grouping with perfect chunking
     no_overlapping_cohorts = (np.bincount(np.concatenate(tuple(chunks_cohorts.keys()))) == 1).all()
 
-    if (
-        every_group_one_block
-        or one_group_per_chunk
-        or single_chunks
-        or no_overlapping_cohorts
-        or not merge
-    ):
+    if every_group_one_block or one_group_per_chunk or single_chunks or no_overlapping_cohorts:
         return chunks_cohorts
 
     # Containment = |Q & S| / |Q|
@@ -1569,10 +1560,7 @@ def dask_groupby_agg(
 
         elif method == "cohorts":
             chunks_cohorts = find_group_cohorts(
-                by_input,
-                [array.chunks[ax] for ax in axis],
-                merge=True,
-                expected_groups=expected_groups,
+                by_input, [array.chunks[ax] for ax in axis], expected_groups=expected_groups
             )
             reduced_ = []
             groups_ = []
diff --git a/tests/test_core.py b/tests/test_core.py
@@ -844,21 +844,16 @@ def test_rechunk_for_blockwise(inchunks, expected):
 
 @requires_dask
 @pytest.mark.parametrize(
-    "expected, labels, chunks, merge",
+    "expected, labels, chunks",
     [
-        [[[0, 1, 2, 3]], [0, 1, 2, 0, 1, 2, 3], (3, 4), True],
-        [[[0], [1], [2], [3]], [0, 1, 2, 0, 1, 2, 3], (2, 2, 2, 1), True],
-        [[[0, 1, 2], [3]], [0, 1, 2, 0, 1, 2, 3], (3, 3, 1), True],
-        [
-            [[0], [1, 2, 3, 4], [5]],
-            np.repeat(np.arange(6), [4, 4, 12, 2, 3, 4]),
-            (4, 8, 4, 9, 4),
-            True,
-        ],
+        [[[0, 1, 2, 3]], [0, 1, 2, 0, 1, 2, 3], (3, 4)],
+        [[[0], [1], [2], [3]], [0, 1, 2, 0, 1, 2, 3], (2, 2, 2, 1)],
+        [[[0, 1, 2], [3]], [0, 1, 2, 0, 1, 2, 3], (3, 3, 1)],
+        [[[0], [1, 2, 3, 4], [5]], np.repeat(np.arange(6), [4, 4, 12, 2, 3, 4]), (4, 8, 4, 9, 4)],
     ],
 )
-def test_find_group_cohorts(expected, labels, chunks: tuple[int], merge: bool) -> None:
-    actual = list(find_group_cohorts(labels, (chunks,), merge).values())
+def test_find_group_cohorts(expected, labels, chunks: tuple[int]) -> None:
+    actual = list(find_group_cohorts(labels, (chunks,)).values())
     assert actual == expected, (actual, expected)