Skip to content

Commit d68b629

Browse files
authored
Add hashing microbenchmark with_hashes (#19373)
## Which issue does this PR close? - Part of #18411 ## Rationale for this change I want to optimize hashing for StringViewArray. In order to do so, I would like a benchmark to show that the optimization works. ## What changes are included in this PR? Add a benchmark for `with_hashes`. Run it like ```shell cargo bench --bench with_hashes ``` Note that I did not add all the possible types of arrays, as I don't plan to optimize the others. ## Are these changes tested? I ran it manually ## Are there any user-facing changes? <!-- If there are user-facing changes then we may require documentation to be updated before approving the PR. --> <!-- If there are any breaking changes to public APIs, please add the `api change` label. -->
1 parent 4e7bba4 commit d68b629

File tree

3 files changed

+219
-4
lines changed

3 files changed

+219
-4
lines changed

Cargo.lock

Lines changed: 5 additions & 4 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

datafusion/common/Cargo.toml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,10 @@ recursive_protection = ["dep:recursive"]
5353
parquet = ["dep:parquet"]
5454
sql = ["sqlparser"]
5555

56+
[[bench]]
57+
harness = false
58+
name = "with_hashes"
59+
5660
[dependencies]
5761
ahash = { workspace = true }
5862
apache-avro = { workspace = true, features = [
@@ -82,6 +86,7 @@ web-time = "1.1.0"
8286

8387
[dev-dependencies]
8488
chrono = { workspace = true }
89+
criterion = { workspace = true }
8590
insta = { workspace = true }
8691
rand = { workspace = true }
8792
sqlparser = { workspace = true }
Lines changed: 209 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,209 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
//! Benchmarks for `with_hashes` function
19+
20+
use ahash::RandomState;
21+
use arrow::array::{
22+
Array, ArrayRef, ArrowPrimitiveType, DictionaryArray, GenericStringArray,
23+
NullBufferBuilder, OffsetSizeTrait, PrimitiveArray, StringViewArray, make_array,
24+
};
25+
use arrow::buffer::NullBuffer;
26+
use arrow::datatypes::{ArrowDictionaryKeyType, Int32Type, Int64Type};
27+
use criterion::{Bencher, Criterion, criterion_group, criterion_main};
28+
use datafusion_common::hash_utils::with_hashes;
29+
use rand::Rng;
30+
use rand::SeedableRng;
31+
use rand::distr::{Alphanumeric, Distribution, StandardUniform};
32+
use rand::prelude::StdRng;
33+
use std::sync::Arc;
34+
35+
/// Number of rows in every array that is hashed by the benchmarks.
const BATCH_SIZE: usize = 8 * 1024;
36+
37+
struct BenchData {
38+
name: &'static str,
39+
array: ArrayRef,
40+
}
41+
42+
fn criterion_benchmark(c: &mut Criterion) {
43+
let pool = StringPool::new(100, 64);
44+
// poll with small strings for string view tests (<=12 bytes are inlined)
45+
let small_pool = StringPool::new(100, 5);
46+
let cases = [
47+
BenchData {
48+
name: "int64",
49+
array: primitive_array::<Int64Type>(BATCH_SIZE),
50+
},
51+
BenchData {
52+
name: "utf8",
53+
array: pool.string_array::<i32>(BATCH_SIZE),
54+
},
55+
BenchData {
56+
name: "large_utf8",
57+
array: pool.string_array::<i64>(BATCH_SIZE),
58+
},
59+
BenchData {
60+
name: "utf8_view",
61+
array: pool.string_view_array(BATCH_SIZE),
62+
},
63+
BenchData {
64+
name: "utf8_view (small)",
65+
array: small_pool.string_view_array(BATCH_SIZE),
66+
},
67+
BenchData {
68+
name: "dictionary_utf8_int32",
69+
array: pool.dictionary_array::<Int32Type>(BATCH_SIZE),
70+
},
71+
];
72+
73+
for BenchData { name, array } in cases {
74+
// with_hash has different code paths for single vs multiple arrays and nulls vs no nulls
75+
let nullable_array = add_nulls(&array);
76+
c.bench_function(&format!("{name}: single, no nulls"), |b| {
77+
do_hash_test(b, std::slice::from_ref(&array));
78+
});
79+
c.bench_function(&format!("{name}: single, nulls"), |b| {
80+
do_hash_test(b, std::slice::from_ref(&nullable_array));
81+
});
82+
c.bench_function(&format!("{name}: multiple, no nulls"), |b| {
83+
let arrays = vec![array.clone(), array.clone(), array.clone()];
84+
do_hash_test(b, &arrays);
85+
});
86+
c.bench_function(&format!("{name}: multiple, nulls"), |b| {
87+
let arrays = vec![
88+
nullable_array.clone(),
89+
nullable_array.clone(),
90+
nullable_array.clone(),
91+
];
92+
do_hash_test(b, &arrays);
93+
});
94+
}
95+
}
96+
97+
fn do_hash_test(b: &mut Bencher, arrays: &[ArrayRef]) {
98+
let state = RandomState::new();
99+
b.iter(|| {
100+
with_hashes(arrays, &state, |hashes| {
101+
assert_eq!(hashes.len(), BATCH_SIZE); // make sure the result is used
102+
Ok(())
103+
})
104+
.unwrap();
105+
});
106+
}
107+
108+
fn create_null_mask(len: usize) -> NullBuffer
109+
where
110+
StandardUniform: Distribution<bool>,
111+
{
112+
let mut rng = make_rng();
113+
let null_density = 0.03;
114+
let mut builder = NullBufferBuilder::new(len);
115+
for _ in 0..len {
116+
if rng.random::<f32>() < null_density {
117+
builder.append_null();
118+
} else {
119+
builder.append_non_null();
120+
}
121+
}
122+
builder.finish().expect("should be nulls in buffer")
123+
}
124+
125+
// Returns an new array that is the same as array, but with nulls
126+
fn add_nulls(array: &ArrayRef) -> ArrayRef {
127+
let array_data = array
128+
.clone()
129+
.into_data()
130+
.into_builder()
131+
.nulls(Some(create_null_mask(array.len())))
132+
.build()
133+
.unwrap();
134+
make_array(array_data)
135+
}
136+
137+
pub fn make_rng() -> StdRng {
138+
StdRng::seed_from_u64(42)
139+
}
140+
141+
/// Pool of random strings from which low cardinality data is drawn
/// (for dictionaries and string views)
struct StringPool {
    /// Candidate values that the generated arrays pick from.
    strings: Vec<String>,
}
145+
146+
impl StringPool {
147+
/// Create a new string pool with the given number of random strings
148+
/// each having between 1 and max_length characters.
149+
fn new(pool_size: usize, max_length: usize) -> Self {
150+
let mut rng = make_rng();
151+
let mut strings = Vec::with_capacity(pool_size);
152+
for _ in 0..pool_size {
153+
let len = rng.random_range(1..=max_length);
154+
let value: Vec<u8> =
155+
rng.clone().sample_iter(&Alphanumeric).take(len).collect();
156+
strings.push(String::from_utf8(value).unwrap());
157+
}
158+
Self { strings }
159+
}
160+
161+
/// Return an iterator over &str of the given length with values randomly chosen from the pool
162+
fn iter_strings(&self, len: usize) -> impl Iterator<Item = &str> {
163+
let mut rng = make_rng();
164+
(0..len).map(move |_| {
165+
let idx = rng.random_range(0..self.strings.len());
166+
self.strings[idx].as_str()
167+
})
168+
}
169+
170+
/// Return a StringArray of the given length with values randomly chosen from the pool
171+
fn string_array<O: OffsetSizeTrait>(&self, array_length: usize) -> ArrayRef {
172+
Arc::new(GenericStringArray::<O>::from_iter_values(
173+
self.iter_strings(array_length),
174+
))
175+
}
176+
177+
/// Return a StringViewArray of the given length with values randomly chosen from the pool
178+
fn string_view_array(&self, array_length: usize) -> ArrayRef {
179+
Arc::new(StringViewArray::from_iter_values(
180+
self.iter_strings(array_length),
181+
))
182+
}
183+
184+
/// Return a DictionaryArray of the given length with values randomly chosen from the pool
185+
fn dictionary_array<T: ArrowDictionaryKeyType>(
186+
&self,
187+
array_length: usize,
188+
) -> ArrayRef {
189+
Arc::new(DictionaryArray::<T>::from_iter(
190+
self.iter_strings(array_length),
191+
))
192+
}
193+
}
194+
195+
pub fn primitive_array<T>(array_len: usize) -> ArrayRef
196+
where
197+
T: ArrowPrimitiveType,
198+
StandardUniform: Distribution<T::Native>,
199+
{
200+
let mut rng = make_rng();
201+
202+
let array: PrimitiveArray<T> = (0..array_len)
203+
.map(|_| Some(rng.random::<T::Native>()))
204+
.collect();
205+
Arc::new(array)
206+
}
207+
208+
criterion_group!(benches, criterion_benchmark);
209+
criterion_main!(benches);

0 commit comments

Comments
 (0)