-
-
Notifications
You must be signed in to change notification settings - Fork 5.6k
ENH: stats.quantile: add support for frequency weights #23941
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 16 commits
f82b9fa
915083b
8d7b0e5
ff80c06
133d5b4
909d13f
3fd961f
f3f1b86
1684ef4
2a30214
b7b13b2
cb2ba66
8f7c229
6dbe8b9
5c57a5c
760eff3
22ac4bf
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -14,17 +14,20 @@ | |
| from scipy.stats._axis_nan_policy import _broadcast_arrays, _contains_nan | ||
|
|
||
|
|
||
| def _quantile_iv(x, p, method, axis, nan_policy, keepdims): | ||
| xp = array_namespace(x, p) | ||
| def _quantile_iv(x, p, method, axis, nan_policy, keepdims, weights): | ||
| xp = array_namespace(x, p, weights) | ||
|
|
||
| if not xp.isdtype(xp.asarray(x).dtype, ('integral', 'real floating')): | ||
| raise ValueError("`x` must have real dtype.") | ||
|
|
||
| if not xp.isdtype(xp.asarray(p).dtype, 'real floating'): | ||
| raise ValueError("`p` must have real floating dtype.") | ||
|
|
||
| x, p = xp_promote(x, p, force_floating=True, xp=xp) | ||
| p = xp.asarray(p, device=xp_device(x)) | ||
| if not (weights is None | ||
| or xp.isdtype(xp.asarray(weights).dtype, ('integral', 'real floating'))): | ||
| raise ValueError("`weights` must have real dtype.") | ||
|
|
||
| x, p, weights = xp_promote(x, p, weights, force_floating=True, xp=xp) | ||
| dtype = x.dtype | ||
|
|
||
| axis_none = axis is None | ||
|
|
@@ -50,6 +53,12 @@ def _quantile_iv(x, p, method, axis, nan_policy, keepdims): | |
| message = f"`method` must be one of {methods}" | ||
| raise ValueError(message) | ||
|
|
||
| no_weights = {'_lower', '_midpoint', '_higher', '_nearest', 'harrell-davis', | ||
| 'round_nearest', 'round_inward', 'round_outward'} | ||
| if weights is not None and method in no_weights: | ||
| message = f"`method='{method}'` does not support `weights`." | ||
| raise ValueError(message) | ||
|
|
||
| contains_nans = _contains_nan(x, nan_policy, xp_omit_okay=True, xp=xp) | ||
|
|
||
| if keepdims not in {None, True, False}: | ||
|
|
@@ -64,8 +73,20 @@ def _quantile_iv(x, p, method, axis, nan_policy, keepdims): | |
| shape[axis] = 1 | ||
| x = xp.full(shape, xp.nan, dtype=dtype, device=xp_device(x)) | ||
|
|
||
| y = xp.sort(x, axis=axis, stable=False) | ||
| y, p = _broadcast_arrays((y, p), axis=axis) | ||
| if weights is None: | ||
| y = xp.sort(x, axis=axis, stable=False) | ||
| y, p = _broadcast_arrays((y, p), axis=axis) | ||
| n_zero_weight = None | ||
| else: | ||
| x, weights = xp.broadcast_arrays(x, weights) | ||
| i_zero_weight = (weights == 0) | ||
| n_zero_weight = xp.count_nonzero(i_zero_weight, axis=axis, keepdims=True) | ||
mdhaber marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| x = xpx.at(x)[i_zero_weight].set(xp.inf, copy=True) | ||
| i_y = xp.argsort(x, axis=axis, stable=False) | ||
| y = xp.take_along_axis(x, i_y, axis=axis) | ||
| weights = xp.take_along_axis(weights, i_y, axis=axis) | ||
| y, p, weights, i_y, n_zero_weight = _broadcast_arrays( | ||
| (y, p, weights, i_y, n_zero_weight), axis=axis) | ||
|
|
||
| if (keepdims is False) and (p.shape[axis] != 1): | ||
| message = "`keepdims` may be False only if the length of `p` along `axis` is 1." | ||
|
|
@@ -74,9 +95,13 @@ def _quantile_iv(x, p, method, axis, nan_policy, keepdims): | |
|
|
||
| y = xp.moveaxis(y, axis, -1) | ||
| p = xp.moveaxis(p, axis, -1) | ||
| weights = weights if weights is None else xp.moveaxis(weights, axis, -1) | ||
| n_zero_weight = (n_zero_weight if n_zero_weight is None | ||
| else xp.moveaxis(n_zero_weight, axis, -1)) | ||
|
|
||
| n = _length_nonmasked(y, -1, xp=xp, keepdims=True) | ||
| n = xp.asarray(n, dtype=dtype, device=xp_device(y)) | ||
| n = n if n_zero_weight is None else n - n_zero_weight | ||
|
|
||
| if contains_nans: | ||
| nans = xp.isnan(y) | ||
|
|
||
|
|
@@ -85,8 +110,7 @@ def _quantile_iv(x, p, method, axis, nan_policy, keepdims): | |
| if nan_policy == 'propagate': | ||
| nan_out = xp.any(nans, axis=-1) | ||
| else: # 'omit' | ||
| non_nan = xp.astype(~nans, xp.uint64) | ||
| n_int = xp.sum(non_nan, axis=-1, keepdims=True) | ||
| n_int = n - xp.count_nonzero(nans, axis=-1, keepdims=True) | ||
| n = xp.astype(n_int, dtype) | ||
| # NaNs are produced only if slice is empty after removing NaNs | ||
| nan_out = xp.any(n == 0, axis=-1) | ||
|
|
@@ -99,17 +123,21 @@ def _quantile_iv(x, p, method, axis, nan_policy, keepdims): | |
| y = xp.asarray(y, copy=True) # ensure writable | ||
| y = xpx.at(y, nans).set(0) # any non-nan will prevent NaN from propagating | ||
|
|
||
| n = xp.asarray(n, dtype=dtype, device=xp_device(y)) | ||
|
|
||
| p_mask = (p > 1) | (p < 0) | xp.isnan(p) | ||
| if xp.any(p_mask): | ||
| p = xp.asarray(p, copy=True) | ||
| p = xpx.at(p, p_mask).set(0.5) # these get NaN-ed out at the end | ||
|
|
||
| return y, p, method, axis, nan_policy, keepdims, n, axis_none, ndim, p_mask, xp | ||
| return (y, p, method, axis, nan_policy, keepdims, | ||
| n, axis_none, ndim, p_mask, weights, xp) | ||
|
|
||
|
|
||
| @xp_capabilities(skip_backends=[("dask.array", "No take_along_axis yet.")], | ||
| jax_jit=False) | ||
| def quantile(x, p, *, method='linear', axis=0, nan_policy='propagate', keepdims=None): | ||
| def quantile(x, p, *, method='linear', axis=0, nan_policy='propagate', keepdims=None, | ||
| weights=None): | ||
| """ | ||
| Compute the p-th quantile of the data along the specified axis. | ||
|
|
||
|
|
@@ -179,10 +207,17 @@ def quantile(x, p, *, method='linear', axis=0, nan_policy='propagate', keepdims= | |
| axis to contain the number of quantiles given by ``p.size``. Therefore: | ||
|
|
||
| - By default, the axis will be reduced away if possible (i.e. if there is | ||
| exactly one element of `q` per axis-slice of `x`). | ||
| exactly one element of `p` per axis-slice of `x`). | ||
| - If `keepdims` is set to True, the axis will not be reduced away. | ||
| - If `keepdims` is set to False, the axis will be reduced away | ||
| if possible, and an error will be raised otherwise. | ||
| weights : array_like of finite, non-negative real numbers | ||
| Frequency weights; e.g., for counting number weights, | ||
| ``quantile(x, p, weights=weights)`` is equivalent to | ||
| ``quantile(np.repeat(x, weights), p)``. Values other than finite counting | ||
| numbers are accepted, but may not have valid statistical interpretations. | ||
| Not compatible with ``method='harrell-davis'`` or those that begin with | ||
| ``'round_'``. | ||
|
|
||
| Returns | ||
| ------- | ||
|
|
@@ -245,6 +280,11 @@ def quantile(x, p, *, method='linear', axis=0, nan_policy='propagate', keepdims= | |
| 3. ``closest_observation``: ``m = -1/2`` and | ||
| ``g = 1 - int((index == j) & (j%2 == 1))`` | ||
|
|
||
| Note that for methods ``inverted_cdf`` and ``averaged_inverted_cdf``, only the | ||
| relative proportions of tied observations (and relative weights) affect the | ||
| results; for all other methods, the total number of observations (and absolute | ||
| weights) matter. | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I was thinking that if I were to include I think I'll be asked to include the information about coverage conditions, etc., that appears in the NumPy documentation, so I'll address that now: I would prefer not to, but I can add references. I don't think the math underlying the weights needs to be spelled out because it's exactly the same math as repeated samples; it's only the implementation that is generalized. Now, that same generalized implementation happens to work for non-integer weights, too, so if there's a reference for valid statistical interpretations of non-integral weights, I'd be interested in including that. |
||
|
|
||
| A different strategy for computing quantiles from [2]_, ``method='harrell-davis'``, | ||
| uses a weighted combination of all elements. The weights are computed as: | ||
|
|
||
|
|
@@ -326,18 +366,19 @@ def quantile(x, p, *, method='linear', axis=0, nan_policy='propagate', keepdims= | |
| """ | ||
| # Input validation / standardization | ||
|
|
||
| temp = _quantile_iv(x, p, method, axis, nan_policy, keepdims) | ||
| y, p, method, axis, nan_policy, keepdims, n, axis_none, ndim, p_mask, xp = temp | ||
| temp = _quantile_iv(x, p, method, axis, nan_policy, keepdims, weights) | ||
| (y, p, method, axis, nan_policy, keepdims, | ||
| n, axis_none, ndim, p_mask, weights, xp) = temp | ||
|
|
||
| if method in {'inverted_cdf', 'averaged_inverted_cdf', 'closest_observation', | ||
| 'hazen', 'interpolated_inverted_cdf', 'linear', | ||
| 'median_unbiased', 'normal_unbiased', 'weibull'}: | ||
| res = _quantile_hf(y, p, n, method, xp) | ||
| res = _quantile_hf(y, p, n, method, weights, xp) | ||
| elif method in {'harrell-davis'}: | ||
| res = _quantile_hd(y, p, n, xp) | ||
| elif method in {'_lower', '_midpoint', '_higher', '_nearest'}: | ||
| res = _quantile_bc(y, p, n, method, xp) | ||
| else: # method.startswith('winsor'): | ||
| else: # method.startswith('round'): | ||
| res = _quantile_winsor(y, p, n, method, xp) | ||
|
|
||
| res = xpx.at(res, p_mask).set(xp.nan) | ||
|
|
@@ -356,13 +397,26 @@ def quantile(x, p, *, method='linear', axis=0, nan_policy='propagate', keepdims= | |
| return res[()] if res.ndim == 0 else res | ||
|
|
||
|
|
||
| def _quantile_hf(y, p, n, method, xp): | ||
| def _quantile_hf(y, p, n, method, weights, xp): | ||
| ms = dict(inverted_cdf=0, averaged_inverted_cdf=0, closest_observation=-0.5, | ||
| interpolated_inverted_cdf=0, hazen=0.5, weibull=p, linear=1 - p, | ||
| median_unbiased=p/3 + 1/3, normal_unbiased=p/4 + 3/8) | ||
| m = ms[method] | ||
| jg = p*n + m - 1 | ||
| j = jg // 1 | ||
|
|
||
| if weights is None: | ||
| jg = p * n + m | ||
| jp1 = jg // 1 | ||
| j = jp1 - 1 | ||
| else: | ||
| cumulative_weights = xp.cumulative_sum(weights, axis=-1) | ||
| n_int = xp.asarray(n, dtype=xp.int64) | ||
| n_int = xp.broadcast_to(n_int, cumulative_weights.shape[:-1] + (1,)) | ||
| total_weight = xp.take_along_axis(cumulative_weights, n_int-1, axis=-1) | ||
|
Comment on lines
+413
to
+415
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. For |
||
| jg = p * total_weight + m | ||
| jp1 = _xp_searchsorted(cumulative_weights, jg, side='right') | ||
| j = _xp_searchsorted(cumulative_weights, jg-1, side='right') | ||
| j, jp1 = xp.astype(j, y.dtype), xp.astype(jp1, y.dtype) | ||
|
|
||
| g = jg % 1 | ||
| if method == 'inverted_cdf': | ||
| g = xp.astype((g > 0), jg.dtype) | ||
|
|
@@ -376,7 +430,7 @@ def _quantile_hf(y, p, n, method, xp): | |
|
|
||
| g = xpx.at(g)[j < 0].set(0) | ||
| j = xp.clip(j, 0., n - 1) | ||
| jp1 = xp.clip(j + 1, 0., n - 1) | ||
| jp1 = xp.clip(jp1, 0., n - 1) | ||
|
|
||
| return ((1 - g) * xp.take_along_axis(y, xp.astype(j, xp.int64), axis=-1) | ||
| + g * xp.take_along_axis(y, xp.astype(jp1, xp.int64), axis=-1)) | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.