feat: Add streaming write support to enable writing larger-than-ram dask-backed arrays and datasets #108
Merged: terraputix merged 28 commits into open-meteo:main from theokb7:feat/streaming-write-and-dataset-api on Mar 25, 2026.
Commits
75bb535  feat: add streaming write API for iterator-based array writing (theokb7)
3665d74  feat: add xarray Dataset write support (theokb7)
7fdaad5  feat: add dask array integration (theokb7)
c3aafdc  refactor: Remove `resource` import (theokb7)
ca8b232  feat: Add memory usage test (theokb7)
b1f2664  feat: add and test fsspec xarray support (theokb7)
55497a9  Merge branch 'main' into feat/streaming-write-and-dataset-api (terraputix)
a423c24  fix linter (terraputix)
9e2ec1e  use similar code for streaming and full writes (terraputix)
b5d09b5  revert unnecessary changes on this branch (terraputix)
0925814  remove dask related changes in xarray tests (terraputix)
13f0ba3  add missing method (terraputix)
13b850a  remove unnecessary dtype from test (terraputix)
10c3d69  use enum based disctinction (terraputix)
c9a07fb  remove example (terraputix)
9669d3d  use freestanding tests and separate into two files (terraputix)
c65f173  fix pyright errors (terraputix)
9124d6d  use numpy dtype instead of string (terraputix)
ae99fb5  use same pattern for temporary file as in other tests (terraputix)
5a5c1c8  raise top level import error if dask not available (terraputix)
0a37954  use fixture for second empty file as well (terraputix)
a857360  delete memory usage test (terraputix)
f9ccd5e  consistency in doc comments (terraputix)
7b09377  cleanup (terraputix)
d76ccb6  use ndindex over itertools (terraputix)
eee9f03  guard against mismatching chunk shapes (terraputix)
12f861b  improve iter type hint (terraputix)
05d5859  validate all dimensions (terraputix)
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
New file (+139 lines):

```python
"""Dask array integration for writing to OM files."""

import math
from typing import Iterator, Optional, Sequence

import numpy as np

from omfiles._rust import OmFileWriter, OmVariable

try:
    import dask.array as da
except ImportError:
    raise ImportError("omfiles[dask] is required for dask functionality")


def _validate_chunk_alignment(
    data_chunks: tuple,
    om_chunks: list[int],
    array_shape: tuple,
) -> None:
    """
    Validate dask chunks are compatible with OM chunks for block-level streaming.

    Every non-last dask chunk along each dimension must be an exact multiple
    of the corresponding OM chunk size (the last chunk may be smaller).
    Additionally, for the leftmost dimension where a dask block contains more
    than one OM chunk, every trailing dimension must be fully covered by each
    dask block. This ensures the local chunk traversal inside a block matches
    the global file order.
    """
    ndim = len(om_chunks)

    for d in range(ndim):
        dim_chunks = data_chunks[d]
        for i, c in enumerate(dim_chunks[:-1]):
            if c % om_chunks[d] != 0:
                raise ValueError(
                    f"Dask chunk size {c} along dimension {d} (block {i}) "
                    f"is not a multiple of the OM chunk size {om_chunks[d]}."
                )

    first_multi = None
    for d in range(ndim):
        local_n = max(math.ceil(c / om_chunks[d]) for c in data_chunks[d])
        if local_n > 1:
            first_multi = d
            break

    if first_multi is not None:
        for d in range(first_multi + 1, ndim):
            dim_chunks = data_chunks[d]
            if not (len(dim_chunks) == 1 and dim_chunks[0] == array_shape[d]):
                raise ValueError(
                    f"Dask blocks have multiple OM chunks in dimension {first_multi}, "
                    f"but dimension {d} is not fully covered by each dask block "
                    f"(dask chunks {dim_chunks} vs array size {array_shape[d]}). "
                    f"Rechunk so trailing dimensions are fully covered."
                )
```
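To make the alignment rule above concrete, here is a minimal standalone sketch (pure Python, no dask) that mirrors the same two checks with hypothetical inputs; `chunks_aligned` is an illustrative helper, not part of the PR:

```python
import math

def chunks_aligned(data_chunks, om_chunks, shape):
    """Return True if dask-style chunks satisfy the alignment rule sketched above."""
    ndim = len(om_chunks)
    # Check 1: every non-last block must be an exact multiple of the OM chunk size.
    for d in range(ndim):
        for c in data_chunks[d][:-1]:
            if c % om_chunks[d] != 0:
                return False
    # Find the leftmost dimension whose blocks span more than one OM chunk.
    first_multi = next(
        (d for d in range(ndim)
         if max(math.ceil(c / om_chunks[d]) for c in data_chunks[d]) > 1),
        None,
    )
    # Check 2: all trailing dimensions must then be covered by a single block.
    if first_multi is not None:
        for d in range(first_multi + 1, ndim):
            if not (len(data_chunks[d]) == 1 and data_chunks[d][0] == shape[d]):
                return False
    return True

# Dask chunks (124, 8192) over an (8192, 8192) array with OM chunks (124, 124):
# dimension 1 spans many OM chunks, but no trailing dimension follows it, so OK.
print(chunks_aligned(((124,) * 66 + (8,), (8192,)), [124, 124], (8192, 8192)))
# True

# Reversed layout (8192, 124): dimension 0 spans many OM chunks while
# dimension 1 is split into many blocks, so the rule rejects it.
print(chunks_aligned(((8192,), (124,) * 66 + (8,)), [124, 124], (8192, 8192)))
# False
```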
```python
def _dask_block_iterator(dask_array: da.Array) -> Iterator[np.ndarray]:
    """
    Yield computed numpy arrays from a dask array in C-order block traversal.

    The OM file format requires chunks to be written in sequential order
    corresponding to a row-major (C-order) traversal of the chunk grid.
    np.ndindex yields indices in C-order: the last axis index varies fastest.
    """
    for block_indices in np.ndindex(*dask_array.numblocks):
        yield dask_array.blocks[block_indices].compute()
```
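`np.ndindex` yields the same row-major order as `itertools.product` over the per-axis ranges (the commit "use ndindex over itertools" swapped between these equivalents). A stdlib-only illustration of "last axis varies fastest" for a 2 x 3 block grid:

```python
from itertools import product

# Row-major (C-order) traversal of a 2 x 3 block grid:
# the second index cycles through 0..2 before the first index advances.
order = list(product(range(2), range(3)))
print(order)
# [(0, 0), (0, 1), (0, 2), (1, 0), (1, 1), (1, 2)]
```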
```python
def write_dask_array(
    writer: OmFileWriter,
    data: da.Array,
    chunks: Optional[Sequence[int]] = None,
    scale_factor: float = 1.0,
    add_offset: float = 0.0,
    compression: str = "pfor_delta_2d",
    name: str = "data",
    children: Optional[Sequence[OmVariable]] = None,
) -> OmVariable:
    """
    Write a dask array to an OM file using streaming/incremental writes.

    Iterates over the blocks of the dask array, computing each block
    on-the-fly, and streams them to the OM file writer. Only one block
    is held in memory at a time.

    The dask array's chunk structure is used to determine the OM file's
    chunk dimensions by default. Dask chunks must be multiples of the OM
    chunk sizes (except the last chunk along each dimension, which may be
    smaller). When a dask block contains more than one OM chunk in a
    dimension, all trailing dimensions must be fully covered by each block.

    Performance: write speed depends on the number of dask tasks, not just
    data size. For best performance, use dask chunks much larger than the
    OM chunk sizes, ideally covering the full extent of trailing dimensions.
    For example, with OM chunks of (124, 124) on an (8192, 8192) array,
    dask chunks of (124, 8192) will write ~10x faster than (124, 124).

    Args:
        writer: An open OmFileWriter instance.
        data: A dask array to write.
        chunks: OM file chunk sizes per dimension. If None, uses the dask
            array's chunk sizes. Dask chunks must be multiples of these.
        scale_factor: Scale factor for float compression (default: 1.0).
        add_offset: Offset for float compression (default: 0.0).
        compression: Compression algorithm (default: "pfor_delta_2d").
        name: Variable name (default: "data").
        children: Child variables (default: None).

    Returns:
        OmVariable representing the written array.

    Raises:
        TypeError: If data is not a dask array.
        ValueError: If dask chunks are incompatible with OM chunks.
    """
    if not isinstance(data, da.Array):
        raise TypeError(f"Expected a dask array, got {type(data)}")

    if chunks is not None and len(chunks) != data.ndim:
        raise ValueError(f"chunks has {len(chunks)} element(s) but data has {data.ndim} dimension(s).")

    om_chunks: list[int] = list(chunks) if chunks is not None else [c[0] for c in data.chunks]
    _validate_chunk_alignment(data.chunks, om_chunks, data.shape)

    return writer.write_array_streaming(
        dimensions=[int(d) for d in data.shape],
        chunks=om_chunks,
        chunk_iterator=_dask_block_iterator(data),
        dtype=data.dtype,
        scale_factor=scale_factor,
        add_offset=add_offset,
        compression=compression,
        name=name,
        children=list(children) if children is not None else [],
    )
```
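When `chunks` is None, the OM chunk sizes default to the first block size along each dimension, via `[c[0] for c in data.chunks]`. A dask-free sketch of that derivation (dask stores `.chunks` as a tuple of per-dimension block-size tuples; the layout below is hypothetical):

```python
# Hypothetical .chunks for an (8192, 8192) array chunked as (124, 8192):
# dimension 0 splits into 66 blocks of 124 plus a smaller last block of 8,
# dimension 1 is a single block covering the full extent.
data_chunks = ((124,) * 66 + (8,), (8192,))

# Default OM chunk sizes: the first block size along each dimension.
om_chunks = [c[0] for c in data_chunks]
print(om_chunks)
# [124, 8192]
```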