|
22 | 22 | "outputs": [], |
23 | 23 | "source": [ |
24 | 24 | "import dask.array\n", |
25 | | - "import matplotlib.pyplot as plt\n", |
26 | | - "import numpy as np\n", |
27 | 25 | "import pandas as pd\n", |
28 | 26 | "import xarray as xr\n", |
29 | 27 | "\n", |
|
56 | 54 | "oisst" |
57 | 55 | ] |
58 | 56 | }, |
| 57 | + { |
| 58 | + "cell_type": "markdown", |
| 59 | + "id": "b7f519ee-e575-492c-a70b-8dad63a8c222", |
| 60 | + "metadata": {}, |
| 61 | + "source": [ |
| 62 | + "To account for Feb-29 being present in some years, we'll construct a time vector to group by as \"mmm-dd\" string.\n", |
| 63 | + "\n", |
| 64 | + "For more options, see https://strftime.org/" |
| 65 | + ] |
| 66 | + }, |
| 67 | + { |
| 68 | + "cell_type": "code", |
| 69 | + "execution_count": null, |
| 70 | + "id": "3c42a618-47bc-4c83-a902-ec4cf3420180", |
| 71 | + "metadata": {}, |
| 72 | + "outputs": [], |
| 73 | + "source": [ |
| 74 | + "day = oisst.time.dt.strftime(\"%h-%d\").rename(\"day\")\n", |
| 75 | + "day" |
| 76 | + ] |
| 77 | + }, |
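| | + {
| | + "cell_type": "markdown",
| | + "id": "f3a1c9d2-8b4e-4f6a-9c2d-1e5b7a3f0c88",
| | + "metadata": {},
| | + "source": [
| | + "As a quick sanity check (a sketch; it assumes the time range spans at least one leap year), \"Feb-29\" should appear among the labels:"
| | + ]
| | + },
| | + {
| | + "cell_type": "code",
| | + "execution_count": null,
| | + "id": "c0a7e1b3-5d2f-4e8a-b6c1-9f4d2a7e5b31",
| | + "metadata": {},
| | + "outputs": [],
| | + "source": [
| | + "# illustrative check: assumes at least one leap year in the record\n",
| | + "assert \"Feb-29\" in pd.unique(day.data)"
| | + ]
| | + },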
59 | 78 | { |
60 | 79 | "cell_type": "markdown", |
61 | 80 | "id": "6d913e7f-25bd-43c4-98b6-93bcb420c524", |
|
80 | 99 | "source": [ |
81 | 100 | "flox.xarray.xarray_reduce(\n", |
82 | 101 | " oisst,\n", |
83 | | - " oisst.time.dt.dayofyear,\n", |
| 102 | + " day,\n", |
84 | 103 | " func=\"mean\",\n", |
85 | 104 | " method=\"map-reduce\",\n", |
86 | 105 | ")" |
|
106 | 125 | "source": [ |
107 | 126 | "flox.xarray.xarray_reduce(\n", |
108 | 127 | " oisst.chunk({\"lat\": -1, \"lon\": 120}),\n", |
109 | | - " oisst.time.dt.dayofyear,\n", |
| 128 | + " day,\n", |
110 | 129 | " func=\"mean\",\n", |
111 | 130 | " method=\"map-reduce\",\n", |
112 | 131 | ")" |
|
143 | 162 | "source": [ |
144 | 163 | "flox.xarray.xarray_reduce(\n", |
145 | 164 | " oisst,\n", |
146 | | - " oisst.time.dt.dayofyear,\n", |
| 165 | + " day,\n", |
147 | 166 | " func=\"mean\",\n", |
148 | 167 | " method=\"cohorts\",\n", |
149 | 168 | ")" |
|
160 | 179 | "[click here](https://flox.readthedocs.io/en/latest/implementation.html#method-cohorts)).\n", |
161 | 180 | "Now we have the opposite problem: the chunk sizes on the output are too small.\n", |
162 | 181 | "\n", |
163 | | - "Looking more closely, We can see the cohorts that `flox` has detected are not\n", |
164 | | - "really cohorts, each cohort is a single group label. We've replicated Xarray's\n", |
165 | | - "current strategy; what flox calls\n", |
166 | | - "[\"split-reduce\"](https://flox.readthedocs.io/en/latest/implementation.html#method-split-reduce-xarray-s-current-groupby-strategy)\n" |
| 182 | + "Let us inspect the cohorts" |
167 | 183 | ] |
168 | 184 | }, |
169 | 185 | { |
|
173 | 189 | "metadata": {}, |
174 | 190 | "outputs": [], |
175 | 191 | "source": [ |
176 | | - "flox.core.find_group_cohorts(\n", |
177 | | - " labels=oisst.time.dt.dayofyear.data,\n", |
| 192 | + "# integer codes for each \"day\"\n", |
| 193 | + "codes, _ = pd.factorize(day.data)\n", |
| 194 | + "cohorts = flox.core.find_group_cohorts(\n", |
| 195 | + " labels=codes,\n", |
178 | 196 | " chunks=(oisst.chunksizes[\"time\"],),\n", |
179 | | - ").values()" |
| 197 | + ")\n", |
| 198 | + "print(len(cohorts))" |
180 | 199 | ] |
181 | 200 | }, |
182 | 201 | { |
183 | 202 | "cell_type": "markdown", |
184 | | - "id": "bcbdbb3b-2aed-4f3f-ad20-efabb52b5e68", |
| 203 | + "id": "068b4109-b7f4-4c16-918d-9a18ff2ed183", |
185 | 204 | "metadata": {}, |
186 | 205 | "source": [ |
187 | | - "## Rechunking data for cohorts\n", |
188 | | - "\n", |
189 | | - "Can we fix the \"out of phase\" problem by rechunking along time?\n", |
190 | | - "\n", |
191 | | - "First lets see where the current chunk boundaries are\n" |
| 206 | + "Looking more closely, we can see many cohorts with a single entry. " |
192 | 207 | ] |
193 | 208 | }, |
194 | 209 | { |
195 | 210 | "cell_type": "code", |
196 | 211 | "execution_count": null, |
197 | | - "id": "90a884bc-1b71-4874-8143-73b3b5c41458", |
| 212 | + "id": "57983cd0-a2e0-4d16-abe6-9572f6f252bf", |
198 | 213 | "metadata": {}, |
199 | 214 | "outputs": [], |
200 | 215 | "source": [ |
201 | | - "array = oisst.data\n", |
202 | | - "labels = oisst.time.dt.dayofyear.data\n", |
203 | | - "axis = oisst.get_axis_num(\"time\")\n", |
204 | | - "oldchunks = array.chunks[axis]\n", |
205 | | - "oldbreaks = np.insert(np.cumsum(oldchunks), 0, 0)\n", |
206 | | - "labels_at_breaks = labels[oldbreaks[:-1]]\n", |
207 | | - "labels_at_breaks" |
| 216 | + "cohorts.values()" |
208 | 217 | ] |
209 | 218 | }, |
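| | + {
| | + "cell_type": "markdown",
| | + "id": "7d2e9b41-3f6c-4a85-9e1d-b0c4f8a26e53",
| | + "metadata": {},
| | + "source": [
| | + "We can quantify that with an illustrative check: count the cohorts that contain exactly one label."
| | + ]
| | + },
| | + {
| | + "cell_type": "code",
| | + "execution_count": null,
| | + "id": "2b8f4c6a-9d13-4e7b-a5c2-f1e60d3b9a74",
| | + "metadata": {},
| | + "outputs": [],
| | + "source": [
| | + "# number of cohorts containing a single group label\n",
| | + "sum(len(labels) == 1 for labels in cohorts.values())"
| | + ]
| | + },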
210 | 219 | { |
211 | 220 | "cell_type": "markdown", |
212 | | - "id": "4b2573e5-0d30-4cb8-b5af-751b824f0689", |
| 221 | + "id": "bcbdbb3b-2aed-4f3f-ad20-efabb52b5e68", |
213 | 222 | "metadata": {}, |
214 | 223 | "source": [ |
215 | | - "Now we'll use a convenient function `rechunk_for_cohorts` to rechunk the `oisst`\n", |
216 | | - "dataset along time. We'll ask it to rechunk so that a new chunk starts at each\n", |
217 | | - "of the elements\n", |
| 224 | + "## Rechunking data for cohorts\n", |
218 | 225 | "\n", |
219 | | - "```\n", |
220 | | - "[244, 264, 284, 304, 324, 344, 364, 19, 39, 59, 79, 99, 119,\n", |
221 | | - " 139, 159, 179, 199, 219, 239]\n", |
222 | | - "```\n", |
| 226 | + "Can we fix the \"out of phase\" problem by rechunking along time?\n", |
223 | 227 | "\n", |
224 | | - "These are labels at the chunk boundaries in the first year of data. We are\n", |
225 | | - "forcing that chunking pattern to repeat as much as possible. We also tell the\n", |
226 | | - "function to ignore any existing chunk boundaries.\n" |
| 228 | + "First lets see where the current chunk boundaries are" |
227 | 229 | ] |
228 | 230 | }, |
229 | 231 | { |
230 | 232 | "cell_type": "code", |
231 | 233 | "execution_count": null, |
232 | | - "id": "a9ab6382-e93b-49e9-8e2e-1ba526046aea", |
| 234 | + "id": "40d393a5-7a4e-4d33-997b-4c422a0b8100", |
233 | 235 | "metadata": {}, |
234 | 236 | "outputs": [], |
235 | 237 | "source": [ |
236 | | - "rechunked = flox.xarray.rechunk_for_cohorts(\n", |
237 | | - " oisst,\n", |
238 | | - " dim=\"time\",\n", |
239 | | - " labels=oisst.time.dt.dayofyear,\n", |
240 | | - " force_new_chunk_at=[\n", |
241 | | - " 244,\n", |
242 | | - " 264,\n", |
243 | | - " 284,\n", |
244 | | - " 304,\n", |
245 | | - " 324,\n", |
246 | | - " 344,\n", |
247 | | - " 364,\n", |
248 | | - " 19,\n", |
249 | | - " 39,\n", |
250 | | - " 59,\n", |
251 | | - " 79,\n", |
252 | | - " 99,\n", |
253 | | - " 119,\n", |
254 | | - " 139,\n", |
255 | | - " 159,\n", |
256 | | - " 179,\n", |
257 | | - " 199,\n", |
258 | | - " 219,\n", |
259 | | - " 239,\n", |
260 | | - " ],\n", |
261 | | - " ignore_old_chunks=True,\n", |
262 | | - ")\n", |
263 | | - "rechunked" |
| 238 | + "oisst.chunksizes[\"time\"][:10]" |
264 | 239 | ] |
265 | 240 | }, |
266 | 241 | { |
267 | 242 | "cell_type": "markdown", |
268 | | - "id": "570d869b-9612-4de9-83ee-336a35c1fdad", |
| 243 | + "id": "cd0033a3-d211-4aef-a284-c9fd3f75f6e4", |
| 244 | + "metadata": {}, |
| 245 | + "source": [ |
| 246 | + "We'll choose to rechunk such that a single month in is a chunk. This is not too different from the current chunking but will help your periodicity problem" |
| 247 | + ] |
| 248 | + }, |
| 249 | + { |
| 250 | + "cell_type": "code", |
| 251 | + "execution_count": null, |
| 252 | + "id": "5914a350-a7db-49b3-9504-6d63ff874f5e", |
269 | 253 | "metadata": {}, |
| 254 | + "outputs": [], |
270 | 255 | "source": [ |
271 | | - "We see that chunks are mostly 20 elements long in time with some differences\n" |
| 256 | + "newchunks = xr.ones_like(day).astype(int).resample(time=\"M\").count()" |
272 | 257 | ] |
273 | 258 | }, |
274 | 259 | { |
275 | 260 | "cell_type": "code", |
276 | 261 | "execution_count": null, |
277 | | - "id": "86bb4461-d921-40f8-9ff7-8d6e7e8c7e4b", |
| 262 | + "id": "90a884bc-1b71-4874-8143-73b3b5c41458", |
278 | 263 | "metadata": {}, |
279 | 264 | "outputs": [], |
280 | 265 | "source": [ |
281 | | - "plt.plot(rechunked.chunksizes[\"time\"], marker=\"x\", ls=\"none\")" |
| 266 | + "rechunked = oisst.chunk(time=tuple(newchunks.data))" |
282 | 267 | ] |
283 | 268 | }, |
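| | + {
| | + "cell_type": "markdown",
| | + "id": "9e5d1f83-4b7a-4c29-a6e0-d2c8b5f17a46",
| | + "metadata": {},
| | + "source": [
| | + "As a quick check (a sketch; the exact values depend on the dataset's start date), the first few time chunks should now match calendar month lengths:"
| | + ]
| | + },
| | + {
| | + "cell_type": "code",
| | + "execution_count": null,
| | + "id": "6a3c8e52-1d9f-4b74-8c05-e7f2a4d16b98",
| | + "metadata": {},
| | + "outputs": [],
| | + "source": [
| | + "# expect calendar month lengths: 28/29, 30, or 31\n",
| | + "rechunked.chunksizes[\"time\"][:12]"
| | + ]
| | + },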
284 | 269 | { |
|
296 | 281 | "metadata": {}, |
297 | 282 | "outputs": [], |
298 | 283 | "source": [ |
299 | | - "flox.core.find_group_cohorts(\n", |
300 | | - " labels=rechunked.time.dt.dayofyear.data,\n", |
| 284 | + "new_cohorts = flox.core.find_group_cohorts(\n", |
| 285 | + " labels=codes,\n", |
301 | 286 | " chunks=(rechunked.chunksizes[\"time\"],),\n", |
302 | | - ").values()" |
| 287 | + ")\n", |
| 288 | + "# one cohort per month!\n", |
| 289 | + "len(new_cohorts)" |
| 290 | + ] |
| 291 | + }, |
| 292 | + { |
| 293 | + "cell_type": "code", |
| 294 | + "execution_count": null, |
| 295 | + "id": "4e2b6f70-c057-4783-ad55-21b20ff27e7f", |
| 296 | + "metadata": {}, |
| 297 | + "outputs": [], |
| 298 | + "source": [ |
| 299 | + "new_cohorts.values()" |
303 | 300 | ] |
304 | 301 | }, |
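| | + {
| | + "cell_type": "markdown",
| | + "id": "d4b7a2c9-6e15-4f38-b0a7-3c9e8d52f164",
| | + "metadata": {},
| | + "source": [
| | + "Each cohort should now contain roughly one month's worth of labels. As an illustrative check, the largest cohort should hold about 31 labels:"
| | + ]
| | + },
| | + {
| | + "cell_type": "code",
| | + "execution_count": null,
| | + "id": "8f1e6c24-7a90-4d5b-9e38-b2d04c7a61f5",
| | + "metadata": {},
| | + "outputs": [],
| | + "source": [
| | + "# cohort sizes should be month-sized (28-31 labels)\n",
| | + "max(len(labels) for labels in new_cohorts.values())"
| | + ]
| | + },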
305 | 302 | { |
|
318 | 315 | "metadata": {}, |
319 | 316 | "outputs": [], |
320 | 317 | "source": [ |
321 | | - "flox.xarray.xarray_reduce(rechunked, rechunked.time.dt.dayofyear, func=\"mean\", method=\"cohorts\")" |
| 318 | + "flox.xarray.xarray_reduce(rechunked, day, func=\"mean\", method=\"cohorts\")" |
322 | 319 | ] |
323 | 320 | }, |
324 | 321 | { |
|