-
Notifications
You must be signed in to change notification settings - Fork 112
Feature: introduces fuzzing with extension arrays #5819
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
0e7ab69
5b99fa6
fc25bd6
0bf82e0
ad63b19
49d26e5
981daba
bcf7135
8ef26ef
e06dc9c
b325a56
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,13 +1,16 @@ | ||
| // SPDX-License-Identifier: Apache-2.0 | ||
| // SPDX-FileCopyrightText: Copyright the Vortex contributors | ||
|
|
||
| use std::sync::Arc; | ||
|
|
||
| use vortex_array::ArrayRef; | ||
| use vortex_array::Canonical; | ||
| use vortex_array::IntoArray; | ||
| use vortex_array::ToCanonical; | ||
| use vortex_array::arrays::BoolArray; | ||
| use vortex_array::arrays::ConstantArray; | ||
| use vortex_array::arrays::DecimalArray; | ||
| use vortex_array::arrays::ExtensionArray; | ||
| use vortex_array::arrays::PrimitiveArray; | ||
| use vortex_array::arrays::VarBinViewArray; | ||
| use vortex_array::compute::fill_null; | ||
|
|
@@ -16,13 +19,13 @@ use vortex_array::vtable::ValidityHelper; | |
| use vortex_buffer::Buffer; | ||
| use vortex_buffer::BufferMut; | ||
| use vortex_dtype::DType; | ||
| use vortex_dtype::ExtDType; | ||
| use vortex_dtype::Nullability; | ||
| use vortex_dtype::match_each_decimal_value_type; | ||
| use vortex_dtype::match_each_native_ptype; | ||
| use vortex_error::VortexExpect; | ||
| use vortex_error::VortexResult; | ||
| use vortex_scalar::Scalar; | ||
|
|
||
| /// Apply fill_null on the canonical form of the array to get a consistent baseline. | ||
| /// This implementation manually fills null values for each canonical type | ||
| /// without using the fill_null method, to serve as an independent baseline for testing. | ||
|
|
@@ -40,13 +43,33 @@ pub fn fill_null_canonical_array( | |
| Canonical::VarBinView(array) => { | ||
| fill_varbinview_array(&array, fill_value, result_nullability) | ||
| } | ||
| Canonical::Struct(_) | ||
| | Canonical::List(_) | ||
| | Canonical::FixedSizeList(_) | ||
| | Canonical::Extension(_) => fill_null(canonical.as_ref(), fill_value)?, | ||
| Canonical::Extension(array) => fill_extension_array(&array, fill_value), | ||
|
||
| Canonical::Struct(_) | Canonical::List(_) | Canonical::FixedSizeList(_) => { | ||
| fill_null(canonical.as_ref(), fill_value)? | ||
| } | ||
| }) | ||
| } | ||
|
|
||
| fn fill_extension_array(array: &ExtensionArray, fill_value: &Scalar) -> ArrayRef { | ||
| let filled_storage = fill_null_canonical_array( | ||
| array.storage().to_canonical(), | ||
| &fill_value.as_extension().storage(), | ||
| ) | ||
| .vortex_expect("fill_null should succeed in canonical form"); | ||
|
|
||
| if filled_storage.dtype().nullability() == array.ext_dtype().storage_dtype().nullability() { | ||
| ExtensionArray::new(array.ext_dtype().clone(), filled_storage).into_array() | ||
| } else { | ||
| let new_ext_dtype = Arc::new(ExtDType::new( | ||
| array.ext_dtype().id().clone(), | ||
| Arc::new(filled_storage.dtype().clone()), | ||
| array.ext_dtype().metadata().cloned(), | ||
| )); | ||
|
||
|
|
||
| ExtensionArray::new(new_ext_dtype, filled_storage).into_array() | ||
| } | ||
| } | ||
|
|
||
| fn fill_bool_array( | ||
| array: &BoolArray, | ||
| fill_value: &Scalar, | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,20 +1,24 @@ | ||
| // SPDX-License-Identifier: Apache-2.0 | ||
| // SPDX-FileCopyrightText: Copyright the Vortex contributors | ||
|
|
||
| use std::sync::Arc; | ||
|
|
||
| use vortex_array::Array; | ||
| use vortex_array::ArrayRef; | ||
| use vortex_array::IntoArray; | ||
| use vortex_array::ToCanonical; | ||
| use vortex_array::accessor::ArrayAccessor; | ||
| use vortex_array::arrays::BoolArray; | ||
| use vortex_array::arrays::DecimalArray; | ||
| use vortex_array::arrays::ExtensionArray; | ||
| use vortex_array::arrays::PrimitiveArray; | ||
| use vortex_array::arrays::StructArray; | ||
| use vortex_array::arrays::VarBinViewArray; | ||
| use vortex_array::validity::Validity; | ||
| use vortex_buffer::BitBuffer; | ||
| use vortex_buffer::Buffer; | ||
| use vortex_dtype::DType; | ||
| use vortex_dtype::ExtDType; | ||
| use vortex_dtype::match_each_decimal_value_type; | ||
| use vortex_dtype::match_each_native_ptype; | ||
| use vortex_error::VortexResult; | ||
|
|
@@ -115,8 +119,23 @@ pub fn filter_canonical_array(array: &dyn Array, filter: &[bool]) -> VortexResul | |
| } | ||
| take_canonical_array_non_nullable_indices(array, indices.as_slice()) | ||
| } | ||
| d @ (DType::Null | DType::Extension(_)) => { | ||
| unreachable!("DType {d} not supported for fuzzing") | ||
| DType::Extension(ext_dtype) => { | ||
| // Extension arrays delegate filter to their storage type | ||
| let filtered_storage = filter_canonical_array(array.to_extension().storage(), filter)?; | ||
|
|
||
| if filtered_storage.dtype().nullability() == ext_dtype.storage_dtype().nullability() { | ||
| Ok(ExtensionArray::new(ext_dtype.clone(), filtered_storage).into_array()) | ||
| } else { | ||
| let new_ext_dtype = Arc::new(ExtDType::new( | ||
| ext_dtype.id().clone(), | ||
| Arc::new(filtered_storage.dtype().clone()), | ||
| ext_dtype.metadata().cloned(), | ||
| )); | ||
| Ok(ExtensionArray::new(new_ext_dtype, filtered_storage).into_array()) | ||
| } | ||
| } | ||
| DType::Null => { | ||
| unreachable!("Cannot search sorted on Null array") | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is not true |
||
| } | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,5 +1,6 @@ | ||
| // SPDX-License-Identifier: Apache-2.0 | ||
| // SPDX-FileCopyrightText: Copyright the Vortex contributors | ||
| use std::sync::Arc; | ||
|
|
||
| use vortex_array::Array; | ||
| use vortex_array::ArrayRef; | ||
|
|
@@ -8,13 +9,15 @@ use vortex_array::ToCanonical; | |
| use vortex_array::accessor::ArrayAccessor; | ||
| use vortex_array::arrays::BoolArray; | ||
| use vortex_array::arrays::DecimalArray; | ||
| use vortex_array::arrays::ExtensionArray; | ||
| use vortex_array::arrays::FixedSizeListArray; | ||
| use vortex_array::arrays::ListViewArray; | ||
| use vortex_array::arrays::PrimitiveArray; | ||
| use vortex_array::arrays::StructArray; | ||
| use vortex_array::arrays::VarBinViewArray; | ||
| use vortex_array::validity::Validity; | ||
| use vortex_dtype::DType; | ||
| use vortex_dtype::ExtDType; | ||
| use vortex_dtype::match_each_decimal_value_type; | ||
| use vortex_dtype::match_each_native_ptype; | ||
| use vortex_error::VortexResult; | ||
|
|
@@ -113,8 +116,24 @@ pub fn slice_canonical_array( | |
| .to_array(), | ||
| ) | ||
| } | ||
| d @ (DType::Null | DType::Extension(_)) => { | ||
| unreachable!("DType {d} not supported for fuzzing") | ||
| DType::Extension(ext_dtype) => { | ||
| // Extension arrays delegate slicing to their storage type | ||
| let sliced_storage = | ||
| slice_canonical_array(array.to_extension().storage(), start, stop)?; | ||
|
|
||
| if sliced_storage.dtype().nullability() == ext_dtype.storage_dtype().nullability() { | ||
| Ok(ExtensionArray::new(ext_dtype.clone(), sliced_storage).into_array()) | ||
| } else { | ||
| let new_ext_dtype = Arc::new(ExtDType::new( | ||
| ext_dtype.id().clone(), | ||
| Arc::new(sliced_storage.dtype().clone()), | ||
| ext_dtype.metadata().cloned(), | ||
| )); | ||
| Ok(ExtensionArray::new(new_ext_dtype, sliced_storage).into_array()) | ||
| } | ||
|
Comment on lines
123
to
136
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Actually this seems to be duplicated in a few files, maybe we can extract this out into a helper method and put it in |
||
| } | ||
| DType::Null => { | ||
| unreachable!("Cannot search sorted on Null array") | ||
| } | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -10,6 +10,7 @@ use arbitrary::Unstructured; | |
| use vortex_buffer::BitBuffer; | ||
| use vortex_buffer::Buffer; | ||
| use vortex_dtype::DType; | ||
| use vortex_dtype::ExtDType; | ||
| use vortex_dtype::IntegerPType; | ||
| use vortex_dtype::NativePType; | ||
| use vortex_dtype::Nullability; | ||
|
|
@@ -164,12 +165,42 @@ fn random_array_chunk( | |
| DType::FixedSizeList(elem_dtype, list_size, null) => { | ||
| random_fixed_size_list(u, elem_dtype, *list_size, *null, chunk_len) | ||
| } | ||
| DType::Extension(..) => { | ||
| todo!("Extension arrays are not implemented") | ||
| } | ||
| DType::Extension(ext_dtype) => random_extension(u, ext_dtype, chunk_len), | ||
| } | ||
| } | ||
|
|
||
| /// Creates a random extension array. | ||
| /// | ||
| /// If the `chunk_len` is specified, the length of the array will be equal to the chunk length. | ||
| fn random_extension( | ||
| u: &mut Unstructured, | ||
| ext_dtype: &Arc<ExtDType>, | ||
| chunk_len: Option<usize>, | ||
| ) -> Result<ArrayRef> { | ||
| use crate::builders::ExtensionBuilder; | ||
|
||
|
|
||
| // Determine array length | ||
| let array_length = chunk_len.unwrap_or(u.int_in_range(0..=20)?); | ||
|
|
||
| // Create builder for the extension array | ||
| let mut builder = ExtensionBuilder::with_capacity(ext_dtype.clone(), array_length); | ||
|
|
||
| // Generate random values | ||
| for _ in 0..array_length { | ||
| // Wrap in extension scalar using Scalar::extension() | ||
| let ext_scalar = Scalar::extension( | ||
| ext_dtype.clone(), | ||
| random_scalar(u, ext_dtype.storage_dtype())?, | ||
| ); | ||
|
|
||
| // Append to builder | ||
| builder | ||
| .append_scalar(&ext_scalar) | ||
| .vortex_expect("can append extension scalar"); | ||
| } | ||
|
|
||
| Ok(builder.finish()) | ||
| } | ||
| /// Creates a random fixed-size list array. | ||
| /// | ||
| /// If the `chunk_len` is specified, the length of the array will be equal to the chunk length. | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nit: periods at the end of sentences (this is true everywhere)