Skip to content

Commit 3575aac

Browse files
committed
perf(eda.correlation): increase the performance
1 parent 2735787 commit 3575aac

File tree

2 files changed

+19
-12
lines changed

2 files changed

+19
-12
lines changed

dataprep/eda/correlation/compute/common.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ def rankdata(data: np.ndarray, axis: int = 0) -> np.ndarray:
2828
name="rankdata-bottleneck", pure=True
2929
)
3030
def nanrankdata(data: np.ndarray, axis: int = 0) -> np.ndarray:
31-
"""delayed version of rankdata"""
31+
"""delayed version of rankdata."""
3232
return nanrankdata_(data, axis=axis)
3333

3434

@@ -38,6 +38,13 @@ def nanrankdata(data: np.ndarray, axis: int = 0) -> np.ndarray:
3838
def kendalltau( # pylint: disable=invalid-name
3939
a: np.ndarray, b: np.ndarray
4040
) -> np.ndarray:
41-
"""delayed version of kendalltau"""
41+
"""delayed version of kendalltau."""
4242
corr = kendalltau_(a, b).correlation
4343
return np.float64(corr) # corr is sometimes a plain float, which causes a dask error
44+
45+
46+
@dask.delayed
47+
def corrcoef(arr: np.ndarray) -> np.ndarray:
48+
"""delayed version of np.corrcoef."""
49+
_, (corr, _) = np.corrcoef(arr, rowvar=False)
50+
return corr

dataprep/eda/correlation/compute/univariate.py

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,9 @@
99
import numpy as np
1010
import pandas as pd
1111

12-
from ...intermediate import Intermediate
1312
from ...data_array import DataArray
14-
from .common import CorrelationMethod, kendalltau, nanrankdata
13+
from ...intermediate import Intermediate
14+
from .common import CorrelationMethod, kendalltau, nanrankdata, corrcoef
1515

1616

1717
def _calc_univariate(
@@ -74,17 +74,17 @@ def _calc_univariate(
7474
def _pearson_1xn(x: da.Array, data: da.Array) -> da.Array:
7575
_, ncols = data.shape
7676

77-
datamask = da.isnan(data)
78-
xmask = da.isnan(x)[:, 0]
77+
fused = da.concatenate([data, x], axis=1)
78+
mask = ~da.isnan(data)
7979

8080
corrs = []
8181
for j in range(ncols):
82-
y = data[:, [j]]
83-
84-
mask = ~(xmask | datamask[:, j])
85-
xy = np.concatenate([x, y], axis=1)[mask]
86-
xy.compute_chunk_sizes() # Not optimal here
87-
_, (corr, _) = da.corrcoef(xy, rowvar=False)
82+
xy = fused[:, [-1, j]]
83+
mask_ = mask[:, -1] & mask[:, j]
84+
xy = xy[mask_]
85+
corr = da.from_delayed(corrcoef(xy), dtype=np.float, shape=())
86+
# da.corrcoef is not usable here because xy has an unknown number of rows after the null filter
87+
# _, (corr, _) = da.corrcoef(xy, rowvar=False)
8888
corrs.append(corr)
8989

9090
return da.stack(corrs)

0 commit comments

Comments
 (0)