From fe6a8a04bc112e22ed8b2e9855bcab9433a0300e Mon Sep 17 00:00:00 2001
From: Ian Rose
Date: Tue, 1 Nov 2022 14:44:07 -0700
Subject: [PATCH 1/2] Add test for shuffling on different string dtypes

---
 tests/benchmarks/test_dataframe.py | 32 ++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

diff --git a/tests/benchmarks/test_dataframe.py b/tests/benchmarks/test_dataframe.py
index 4d4f5ce00d..f033a9b6fc 100644
--- a/tests/benchmarks/test_dataframe.py
+++ b/tests/benchmarks/test_dataframe.py
@@ -1,3 +1,10 @@
+import random
+import string
+
+import dask.dataframe as dd
+import numpy
+import pandas
+import pytest
 from dask.sizeof import sizeof
 from dask.utils import format_bytes
 
@@ -58,3 +65,28 @@ def test_shuffle(small_client):
     shuf = df.shuffle(0, shuffle="tasks")
     result = shuf.size
     wait(result, small_client, 20 * 60)
+
+
+@pytest.mark.parametrize("dtype", ["object", "string[python]", "string[pyarrow]"])
+def test_shuffle_string(dtype, small_client):
+    def make_partition(s, n=100_000):
+        random.seed(s)
+        s1 = pandas.Series(
+            [
+                "".join(
+                    random.choices(string.ascii_letters, k=random.randint(100, 1000))
+                )
+                for _ in range(n)
+            ],
+            dtype=dtype,
+            name="label",
+        )
+        df = pandas.DataFrame(numpy.random.randint(0, 100, size=(n, 10)))
+        df.insert(0, "label", s1)
+        return df
+
+    meta = make_partition(0, n=10)
+
+    ddf = dd.from_map(make_partition, range(100), meta=meta)
+    result = ddf.set_index("label").persist()
+    wait(result, small_client, 20 * 60)

From 94d13e33b1fb6aa9c3609503274a21f66c9fb13f Mon Sep 17 00:00:00 2001
From: ncclementi
Date: Wed, 16 Nov 2022 14:43:36 -0500
Subject: [PATCH 2/2] pandas dtypes not supported in <2022.10.1

---
 tests/benchmarks/test_dataframe.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/tests/benchmarks/test_dataframe.py b/tests/benchmarks/test_dataframe.py
index f033a9b6fc..45adf2a328 100644
--- a/tests/benchmarks/test_dataframe.py
+++ b/tests/benchmarks/test_dataframe.py
@@ -1,12 +1,14 @@
 import random
 import string
 
+import dask
 import dask.dataframe as dd
 import numpy
 import pandas
 import pytest
 from dask.sizeof import sizeof
 from dask.utils import format_bytes
+from packaging.version import Version
 
 from ..utils_test import cluster_memory, timeseries_of_size, wait
 
@@ -67,6 +69,10 @@ def test_shuffle(small_client):
     wait(result, small_client, 20 * 60)
 
 
+@pytest.mark.skipif(
+    Version(dask.__version__) < Version("2022.10.1"),
+    reason=" No support for pandas string dtypes in versions < 2022.10.1",
+)
 @pytest.mark.parametrize("dtype", ["object", "string[python]", "string[pyarrow]"])
 def test_shuffle_string(dtype, small_client):
     def make_partition(s, n=100_000):