Add xrdtypes.py!

dcherian · dcherian · commit bf2402b094c6 · 2021-10-05T14:59:42.000+05:30
diff --git a/dask_groupby/xrdtypes.py b/dask_groupby/xrdtypes.py
@@ -0,0 +1,179 @@
+import functools
+
+import numpy as np
+
+from . import xrutils as utils
+
+# Use as a sentinel value to indicate a dtype appropriate NA value.
+NA = utils.ReprObject("<NA>")
+
+
+@functools.total_ordering
+class AlwaysGreaterThan:
+    def __gt__(self, other):
+        return True
+
+    def __eq__(self, other):
+        return isinstance(other, type(self))
+
+
+@functools.total_ordering
+class AlwaysLessThan:
+    def __lt__(self, other):
+        return True
+
+    def __eq__(self, other):
+        return isinstance(other, type(self))
+
+
+# Equivalence to np.inf (-np.inf) for object-type
+INF = AlwaysGreaterThan()
+NINF = AlwaysLessThan()
+
+
+# Pairs of types that, if both found, should be promoted to object dtype
+# instead of following NumPy's own type-promotion rules. These type promotion
+# rules match pandas instead. For reference, see the NumPy type hierarchy:
+# https://docs.scipy.org/doc/numpy-1.13.0/reference/arrays.scalars.html
+PROMOTE_TO_OBJECT = [
+    {np.number, np.character},  # numpy promotes to character
+    {np.bool_, np.character},  # numpy promotes to character
+    {np.bytes_, np.unicode_},  # numpy promotes to unicode
+]
+
+
+def maybe_promote(dtype):
+    """Simpler equivalent of pandas.core.common._maybe_promote
+
+    Parameters
+    ----------
+    dtype : np.dtype
+
+    Returns
+    -------
+    dtype : Promoted dtype that can hold missing values.
+    fill_value : Valid missing value for the promoted dtype.
+    """
+    # N.B. these casting rules should match pandas
+    if np.issubdtype(dtype, np.floating):
+        fill_value = np.nan
+    elif np.issubdtype(dtype, np.timedelta64):
+        # See https://github.com/numpy/numpy/issues/10685
+        # np.timedelta64 is a subclass of np.integer
+        # Check np.timedelta64 before np.integer
+        fill_value = np.timedelta64("NaT")
+    elif np.issubdtype(dtype, np.integer):
+        dtype = np.float32 if dtype.itemsize <= 2 else np.float64
+        fill_value = np.nan
+    elif np.issubdtype(dtype, np.complexfloating):
+        fill_value = np.nan + np.nan * 1j
+    elif np.issubdtype(dtype, np.datetime64):
+        fill_value = np.datetime64("NaT")
+    else:
+        dtype = object
+        fill_value = np.nan
+    return np.dtype(dtype), fill_value
+
+
+NAT_TYPES = {np.datetime64("NaT").dtype, np.timedelta64("NaT").dtype}
+
+
+def get_fill_value(dtype):
+    """Return an appropriate fill value for this dtype.
+
+    Parameters
+    ----------
+    dtype : np.dtype
+
+    Returns
+    -------
+    fill_value : Missing value corresponding to this dtype.
+    """
+    _, fill_value = maybe_promote(dtype)
+    return fill_value
+
+
+def get_pos_infinity(dtype, max_for_int=False):
+    """Return an appropriate positive infinity for this dtype.
+
+    Parameters
+    ----------
+    dtype : np.dtype
+    max_for_int : bool
+        Return np.iinfo(dtype).max instead of np.inf
+
+    Returns
+    -------
+    fill_value : positive infinity value corresponding to this dtype.
+    """
+    if issubclass(dtype.type, np.floating):
+        return np.inf
+
+    if issubclass(dtype.type, np.integer):
+        if max_for_int:
+            return np.iinfo(dtype).max
+        else:
+            return np.inf
+
+    if issubclass(dtype.type, np.complexfloating):
+        return np.inf + 1j * np.inf
+
+    return INF
+
+
+def get_neg_infinity(dtype, min_for_int=False):
+    """Return an appropriate positive infinity for this dtype.
+
+    Parameters
+    ----------
+    dtype : np.dtype
+    min_for_int : bool
+        Return np.iinfo(dtype).min instead of -np.inf
+
+    Returns
+    -------
+    fill_value : positive infinity value corresponding to this dtype.
+    """
+    if issubclass(dtype.type, np.floating):
+        return -np.inf
+
+    if issubclass(dtype.type, np.integer):
+        if min_for_int:
+            return np.iinfo(dtype).min
+        else:
+            return -np.inf
+
+    if issubclass(dtype.type, np.complexfloating):
+        return -np.inf - 1j * np.inf
+
+    return NINF
+
+
+def is_datetime_like(dtype):
+    """Check if a dtype is a subclass of the numpy datetime types"""
+    return np.issubdtype(dtype, np.datetime64) or np.issubdtype(dtype, np.timedelta64)
+
+
+def result_type(*arrays_and_dtypes):
+    """Like np.result_type, but with type promotion rules matching pandas.
+
+    Examples of changed behavior:
+    number + string -> object (not string)
+    bytes + unicode -> object (not unicode)
+
+    Parameters
+    ----------
+    *arrays_and_dtypes : list of arrays and dtypes
+        The dtype is extracted from both numpy and dask arrays.
+
+    Returns
+    -------
+    numpy.dtype for the result.
+    """
+    types = {np.result_type(t).type for t in arrays_and_dtypes}
+
+    for left, right in PROMOTE_TO_OBJECT:
+        if any(issubclass(t, left) for t in types) and any(issubclass(t, right) for t in types):
+            return np.dtype(object)
+
+    return np.result_type(*arrays_and_dtypes)