
Commit 58bc9be (parent 15abf49)

Fix missing resampling groups. (#312)

* Fix missing resampling groups. Closes pydata/xarray#8592
* Update climatology notebook

File tree: 4 files changed, +112 / -81 lines

asv_bench/benchmarks/cohorts.py

Lines changed: 5 additions & 1 deletion
@@ -73,7 +73,11 @@ class NWMMidwest(Cohorts):
     def setup(self, *args, **kwargs):
         x = np.repeat(np.arange(30), 150)
         y = np.repeat(np.arange(30), 60)
-        self.by = x[np.newaxis, :] * y[:, np.newaxis]
+        by = x[np.newaxis, :] * y[:, np.newaxis]
+
+        self.by = flox.core._factorize_multiple(
+            (by,), expected_groups=(None,), any_by_dask=False, reindex=False
+        )[0][0]
 
         self.array = dask.array.ones(self.by.shape, chunks=(350, 350))
         self.axis = (-2, -1)
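For readers who don't want to chase the private `flox.core._factorize_multiple` helper, the effect of this change is roughly the following (a sketch using `pandas.factorize` as a stand-in; the exact codes flox produces may differ):

```python
import numpy as np
import pandas as pd

# Roughly what the new benchmark setup does: replace the 2-D product of labels
# (whose values have gaps) with dense integer codes before handing it to flox.
x = np.repeat(np.arange(30), 150)
y = np.repeat(np.arange(30), 60)
by = x[np.newaxis, :] * y[:, np.newaxis]   # shape (1800, 4500)

codes, uniques = pd.factorize(by.ravel())  # dense codes in 0..n_groups-1
by_codes = codes.reshape(by.shape)         # approximately what self.by now holds
print(by_codes.shape, uniques.size)
```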

docs/source/user-stories/climatology.ipynb

Lines changed: 71 additions & 74 deletions
@@ -22,8 +22,6 @@
    "outputs": [],
    "source": [
     "import dask.array\n",
-    "import matplotlib.pyplot as plt\n",
-    "import numpy as np\n",
     "import pandas as pd\n",
     "import xarray as xr\n",
     "\n",
@@ -56,6 +54,27 @@
     "oisst"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "b7f519ee-e575-492c-a70b-8dad63a8c222",
+   "metadata": {},
+   "source": [
+    "To account for Feb-29 being present in some years, we'll construct a time vector to group by as \"mmm-dd\" string.\n",
+    "\n",
+    "For more options, see https://strftime.org/"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3c42a618-47bc-4c83-a902-ec4cf3420180",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "day = oisst.time.dt.strftime(\"%h-%d\").rename(\"day\")\n",
+    "day"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "6d913e7f-25bd-43c4-98b6-93bcb420c524",
@@ -80,7 +99,7 @@
    "source": [
     "flox.xarray.xarray_reduce(\n",
     "    oisst,\n",
-    "    oisst.time.dt.dayofyear,\n",
+    "    day,\n",
     "    func=\"mean\",\n",
     "    method=\"map-reduce\",\n",
     ")"
@@ -106,7 +125,7 @@
    "source": [
     "flox.xarray.xarray_reduce(\n",
     "    oisst.chunk({\"lat\": -1, \"lon\": 120}),\n",
-    "    oisst.time.dt.dayofyear,\n",
+    "    day,\n",
     "    func=\"mean\",\n",
     "    method=\"map-reduce\",\n",
     ")"
@@ -143,7 +162,7 @@
    "source": [
     "flox.xarray.xarray_reduce(\n",
     "    oisst,\n",
-    "    oisst.time.dt.dayofyear,\n",
+    "    day,\n",
     "    func=\"mean\",\n",
     "    method=\"cohorts\",\n",
     ")"
@@ -160,10 +179,7 @@
     "[click here](https://flox.readthedocs.io/en/latest/implementation.html#method-cohorts)).\n",
     "Now we have the opposite problem: the chunk sizes on the output are too small.\n",
     "\n",
-    "Looking more closely, We can see the cohorts that `flox` has detected are not\n",
-    "really cohorts, each cohort is a single group label. We've replicated Xarray's\n",
-    "current strategy; what flox calls\n",
-    "[\"split-reduce\"](https://flox.readthedocs.io/en/latest/implementation.html#method-split-reduce-xarray-s-current-groupby-strategy)\n"
+    "Let us inspect the cohorts"
    ]
   },
   {
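Throughout the notebook diff above, `oisst.time.dt.dayofyear` is replaced by the new `day` grouping variable. A self-contained sketch of what that variable looks like, using a synthetic time axis in place of the notebook's `oisst` data:

```python
import pandas as pd
import xarray as xr

# "%h" is the abbreviated month name (equivalent to "%b" on most platforms),
# so labels look like "Dec-31", "Jan-01", ... and Feb-29 gets its own label.
time = xr.DataArray(
    pd.date_range("2019-12-30", "2020-03-02", freq="D"), dims="time", name="time"
)
day = time.dt.strftime("%h-%d").rename("day")
print(day.values[:4])           # ['Dec-30' 'Dec-31' 'Jan-01' 'Jan-02']
print("Feb-29" in day.values)   # True: the leap day is its own group
```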
@@ -173,112 +189,81 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "flox.core.find_group_cohorts(\n",
-    "    labels=oisst.time.dt.dayofyear.data,\n",
+    "# integer codes for each \"day\"\n",
+    "codes, _ = pd.factorize(day.data)\n",
+    "cohorts = flox.core.find_group_cohorts(\n",
+    "    labels=codes,\n",
     "    chunks=(oisst.chunksizes[\"time\"],),\n",
-    ").values()"
+    ")\n",
+    "print(len(cohorts))"
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "bcbdbb3b-2aed-4f3f-ad20-efabb52b5e68",
+   "id": "068b4109-b7f4-4c16-918d-9a18ff2ed183",
    "metadata": {},
    "source": [
-    "## Rechunking data for cohorts\n",
-    "\n",
-    "Can we fix the \"out of phase\" problem by rechunking along time?\n",
-    "\n",
-    "First lets see where the current chunk boundaries are\n"
+    "Looking more closely, we can see many cohorts with a single entry."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "90a884bc-1b71-4874-8143-73b3b5c41458",
+   "id": "57983cd0-a2e0-4d16-abe6-9572f6f252bf",
    "metadata": {},
    "outputs": [],
    "source": [
-    "array = oisst.data\n",
-    "labels = oisst.time.dt.dayofyear.data\n",
-    "axis = oisst.get_axis_num(\"time\")\n",
-    "oldchunks = array.chunks[axis]\n",
-    "oldbreaks = np.insert(np.cumsum(oldchunks), 0, 0)\n",
-    "labels_at_breaks = labels[oldbreaks[:-1]]\n",
-    "labels_at_breaks"
+    "cohorts.values()"
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "4b2573e5-0d30-4cb8-b5af-751b824f0689",
+   "id": "bcbdbb3b-2aed-4f3f-ad20-efabb52b5e68",
    "metadata": {},
    "source": [
-    "Now we'll use a convenient function `rechunk_for_cohorts` to rechunk the `oisst`\n",
-    "dataset along time. We'll ask it to rechunk so that a new chunk starts at each\n",
-    "of the elements\n",
+    "## Rechunking data for cohorts\n",
     "\n",
-    "```\n",
-    "[244, 264, 284, 304, 324, 344, 364, 19, 39, 59, 79, 99, 119,\n",
-    " 139, 159, 179, 199, 219, 239]\n",
-    "```\n",
+    "Can we fix the \"out of phase\" problem by rechunking along time?\n",
     "\n",
-    "These are labels at the chunk boundaries in the first year of data. We are\n",
-    "forcing that chunking pattern to repeat as much as possible. We also tell the\n",
-    "function to ignore any existing chunk boundaries.\n"
+    "First let's see where the current chunk boundaries are"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "a9ab6382-e93b-49e9-8e2e-1ba526046aea",
+   "id": "40d393a5-7a4e-4d33-997b-4c422a0b8100",
    "metadata": {},
    "outputs": [],
    "source": [
-    "rechunked = flox.xarray.rechunk_for_cohorts(\n",
-    "    oisst,\n",
-    "    dim=\"time\",\n",
-    "    labels=oisst.time.dt.dayofyear,\n",
-    "    force_new_chunk_at=[\n",
-    "        244,\n",
-    "        264,\n",
-    "        284,\n",
-    "        304,\n",
-    "        324,\n",
-    "        344,\n",
-    "        364,\n",
-    "        19,\n",
-    "        39,\n",
-    "        59,\n",
-    "        79,\n",
-    "        99,\n",
-    "        119,\n",
-    "        139,\n",
-    "        159,\n",
-    "        179,\n",
-    "        199,\n",
-    "        219,\n",
-    "        239,\n",
-    "    ],\n",
-    "    ignore_old_chunks=True,\n",
-    ")\n",
-    "rechunked"
+    "oisst.chunksizes[\"time\"][:10]"
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "570d869b-9612-4de9-83ee-336a35c1fdad",
+   "id": "cd0033a3-d211-4aef-a284-c9fd3f75f6e4",
+   "metadata": {},
+   "source": [
+    "We'll choose to rechunk such that a single month is in each chunk. This is not too different from the current chunking, but will help with the periodicity problem."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5914a350-a7db-49b3-9504-6d63ff874f5e",
    "metadata": {},
+   "outputs": [],
    "source": [
-    "We see that chunks are mostly 20 elements long in time with some differences\n"
+    "newchunks = xr.ones_like(day).astype(int).resample(time=\"M\").count()"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "86bb4461-d921-40f8-9ff7-8d6e7e8c7e4b",
+   "id": "90a884bc-1b71-4874-8143-73b3b5c41458",
    "metadata": {},
    "outputs": [],
    "source": [
-    "plt.plot(rechunked.chunksizes[\"time\"], marker=\"x\", ls=\"none\")"
+    "rechunked = oisst.chunk(time=tuple(newchunks.data))"
    ]
   },
   {
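The replacement cells above compute month-sized chunks directly from the calendar instead of calling `rechunk_for_cohorts`. A self-contained sketch of the same idea on synthetic data (standing in for the notebook's `oisst`; dask needs to be installed for `.chunk`):

```python
import numpy as np
import pandas as pd
import xarray as xr

# One year of synthetic daily data along "time".
time = pd.date_range("2020-01-01", "2020-12-31", freq="D")
arr = xr.DataArray(np.ones(time.size), dims="time", coords={"time": time})

day = arr.time.dt.strftime("%h-%d").rename("day")
# Days per month become the new chunk sizes, so each chunk holds one month.
newchunks = xr.ones_like(day).astype(int).resample(time="M").count()
rechunked = arr.chunk(time=tuple(newchunks.data))
print(rechunked.chunksizes["time"])   # (31, 29, 31, 30, ...): one chunk per month
```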
@@ -296,10 +281,22 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "flox.core.find_group_cohorts(\n",
-    "    labels=rechunked.time.dt.dayofyear.data,\n",
+    "new_cohorts = flox.core.find_group_cohorts(\n",
+    "    labels=codes,\n",
     "    chunks=(rechunked.chunksizes[\"time\"],),\n",
-    ").values()"
+    ")\n",
+    "# one cohort per month!\n",
+    "len(new_cohorts)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4e2b6f70-c057-4783-ad55-21b20ff27e7f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "new_cohorts.values()"
    ]
   },
   {
@@ -318,7 +315,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "flox.xarray.xarray_reduce(rechunked, rechunked.time.dt.dayofyear, func=\"mean\", method=\"cohorts\")"
+    "flox.xarray.xarray_reduce(rechunked, day, func=\"mean\", method=\"cohorts\")"
    ]
   },
   {
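With month-aligned chunks, every "mmm-dd" label of a given month only ever occurs in that month's chunks, so `find_group_cohorts` groups labels month by month, and `method="cohorts"` then reduces each month's chunks independently. A numpy-only toy sketch of the detection step (against the `find_group_cohorts` in this commit, which returns a dict of cohorts; the return type has changed across flox versions):

```python
import numpy as np
import flox.core

# Two "years" of twelve 3-day "months": 36 distinct day labels, with one chunk
# per month along time, so each label only appears in its month's chunks.
labels = np.tile(np.arange(36), 2)    # factorized day codes, as pd.factorize would give
month_chunks = (3,) * 24              # 24 chunks of 3 days each
cohorts = flox.core.find_group_cohorts(labels=labels, chunks=(month_chunks,))
print(len(cohorts))                   # expect 12: one cohort per "month"
```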

flox/core.py

Lines changed: 9 additions & 6 deletions
@@ -293,13 +293,16 @@ def find_group_cohorts(
 
     # can happen when `expected_groups` is passed but not all labels are present
     # (binning, resampling)
-    present_labels = chunks_per_label != 0
-    if not present_labels.all():
-        bitmask = bitmask[..., present_labels]
+    present_labels = np.arange(bitmask.shape[LABEL_AXIS])
+    present_labels_mask = chunks_per_label != 0
+    if not present_labels_mask.all():
+        present_labels = present_labels[present_labels_mask]
+        bitmask = bitmask[..., present_labels_mask]
+        chunks_per_label = chunks_per_label[present_labels_mask]
 
     label_chunks = {
-        lab: bitmask.indices[slice(bitmask.indptr[lab], bitmask.indptr[lab + 1])]
-        for lab in range(bitmask.shape[-1])
+        present_labels[idx]: bitmask.indices[slice(bitmask.indptr[idx], bitmask.indptr[idx + 1])]
+        for idx in range(bitmask.shape[LABEL_AXIS])
     }
 
     # Invert the label_chunks mapping so we know which labels occur together.
@@ -334,7 +337,7 @@ def invert(x) -> tuple[np.ndarray, ...]:
     # - S is the existing set
     MIN_CONTAINMENT = 0.75  # arbitrary
     asfloat = bitmask.astype(float)
-    containment = ((asfloat.T @ asfloat) / chunks_per_label[present_labels]).tocsr()
+    containment = ((asfloat.T @ asfloat) / chunks_per_label).tocsr()
     mask = containment.data < MIN_CONTAINMENT
     containment.data[mask] = 0
     containment.eliminate_zeros()
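The bug these hunks fix: when some expected groups never occur (empty bins from binning or resampling), the empty columns of `bitmask` were dropped, but the cohort dictionary then used the compressed column positions as group labels. A toy numpy illustration of that off-by-one (not flox internals):

```python
import numpy as np

# Five expected groups, but group 2 never occurs in any chunk (an empty bin).
chunks_per_label = np.array([2, 1, 0, 3, 1])
mask = chunks_per_label != 0

# Before: compressed column positions were reused as label ids, so every label
# after the gap referred to the wrong group.
compressed_positions = np.arange(mask.sum())               # [0 1 2 3]
# After: keep the original label ids of the surviving columns, and filter
# chunks_per_label once so later computations stay aligned with them.
present_labels = np.arange(chunks_per_label.size)[mask]    # [0 1 3 4]
chunks_per_label = chunks_per_label[mask]                  # [2 1 3 1]
print(compressed_positions, present_labels)
```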

tests/test_xarray.py

Lines changed: 27 additions & 0 deletions
@@ -629,3 +629,30 @@ def test_groupby_2d_dataset():
         expected.counts.dims == actual.counts.dims
     )  # https://github.com/pydata/xarray/issues/8292
     xr.testing.assert_identical(expected, actual)
+
+
+@pytest.mark.parametrize("chunk", (pytest.param(True, marks=requires_dask), False))
+def test_resampling_missing_groups(chunk):
+    # Regression test for https://github.com/pydata/xarray/issues/8592
+    time_coords = pd.to_datetime(
+        ["2018-06-13T03:40:36", "2018-06-13T05:50:37", "2018-06-15T03:02:34"]
+    )
+
+    latitude_coords = [0.0]
+    longitude_coords = [0.0]
+
+    data = [[[1.0]], [[2.0]], [[3.0]]]
+
+    da = xr.DataArray(
+        data,
+        coords={"time": time_coords, "latitude": latitude_coords, "longitude": longitude_coords},
+        dims=["time", "latitude", "longitude"],
+    )
+    if chunk:
+        da = da.chunk(time=1)
+    # Without chunking the dataarray, it works:
+    with xr.set_options(use_flox=False):
+        expected = da.resample(time="1D").mean()
+    with xr.set_options(use_flox=True):
+        actual = da.resample(time="1D").mean()
+    xr.testing.assert_identical(expected, actual)
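For context on what "missing groups" means here: resampling these three timestamps to daily bins creates a 2018-06-14 bin with no observations, which should come out as NaN. Plain xarray already produced that; the chunked flox path mishandled the absent bin before this fix. A minimal xarray-only sketch of the expected result:

```python
import pandas as pd
import xarray as xr

time = pd.to_datetime(["2018-06-13T03:40:36", "2018-06-13T05:50:37", "2018-06-15T03:02:34"])
da = xr.DataArray([1.0, 2.0, 3.0], coords={"time": time}, dims="time")
print(da.resample(time="1D").mean())
# 2018-06-13 -> 1.5, 2018-06-14 -> nan, 2018-06-15 -> 3.0
```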
