From 0e7ab69ad45b73c81ae22811e90f335e06c85ddc Mon Sep 17 00:00:00 2001 From: Pratham Agarwal Date: Fri, 26 Dec 2025 11:02:51 +0530 Subject: [PATCH 1/2] feat: introduces fuzzing with extension arrays Signed-off-by: Pratham Agarwal --- fuzz/src/array/compare.rs | 12 +++++++-- fuzz/src/array/fill_null.rs | 33 ++++++++++++++++++++---- fuzz/src/array/filter.rs | 23 +++++++++++++++-- fuzz/src/array/search_sorted.rs | 12 +++++++-- fuzz/src/array/slice.rs | 23 +++++++++++++++-- fuzz/src/array/sort.rs | 23 +++++++++++++++-- fuzz/src/array/take.rs | 24 ++++++++++++++--- vortex-array/src/arrays/arbitrary.rs | 37 ++++++++++++++++++++++++--- vortex-array/src/compute/fill_null.rs | 34 ++++++++++++++++++++++++ vortex-dtype/src/arbitrary/mod.rs | 34 ++++++++++++++++++++++-- vortex-scalar/src/arbitrary.rs | 4 +-- 11 files changed, 233 insertions(+), 26 deletions(-) diff --git a/fuzz/src/array/compare.rs b/fuzz/src/array/compare.rs index 618b2647892..127b08b2493 100644 --- a/fuzz/src/array/compare.rs +++ b/fuzz/src/array/compare.rs @@ -123,8 +123,16 @@ pub fn compare_canonical_array(array: &dyn Array, value: &Scalar, operator: Oper ) .into_array() } - d @ (DType::Null | DType::Extension(_)) => { - unreachable!("DType {d} not supported for fuzzing") + DType::Null => { + unreachable!("DType null not supported for fuzzing") + } + DType::Extension(..) => { + // Extension arrays delegate comparison to their storage type + compare_canonical_array( + array.to_extension().storage(), + &value.as_extension().storage(), + operator, + ) } } } diff --git a/fuzz/src/array/fill_null.rs b/fuzz/src/array/fill_null.rs index adb52d844c4..708cd5e9c55 100644 --- a/fuzz/src/array/fill_null.rs +++ b/fuzz/src/array/fill_null.rs @@ -1,6 +1,8 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors +use std::sync::Arc; + use vortex_array::ArrayRef; use vortex_array::Canonical; use vortex_array::IntoArray; @@ -8,6 +10,7 @@ use vortex_array::ToCanonical; use vortex_array::arrays::BoolArray; use vortex_array::arrays::ConstantArray; use vortex_array::arrays::DecimalArray; +use vortex_array::arrays::ExtensionArray; use vortex_array::arrays::PrimitiveArray; use vortex_array::arrays::VarBinViewArray; use vortex_array::compute::fill_null; @@ -16,13 +19,13 @@ use vortex_array::vtable::ValidityHelper; use vortex_buffer::Buffer; use vortex_buffer::BufferMut; use vortex_dtype::DType; +use vortex_dtype::ExtDType; use vortex_dtype::Nullability; use vortex_dtype::match_each_decimal_value_type; use vortex_dtype::match_each_native_ptype; use vortex_error::VortexExpect; use vortex_error::VortexResult; use vortex_scalar::Scalar; - /// Apply fill_null on the canonical form of the array to get a consistent baseline. /// This implementation manually fills null values for each canonical type /// without using the fill_null method, to serve as an independent baseline for testing. @@ -40,13 +43,33 @@ pub fn fill_null_canonical_array( Canonical::VarBinView(array) => { fill_varbinview_array(&array, fill_value, result_nullability) } - Canonical::Struct(_) - | Canonical::List(_) - | Canonical::FixedSizeList(_) - | Canonical::Extension(_) => fill_null(canonical.as_ref(), fill_value)?, + Canonical::Extension(array) => fill_extension_array(&array, fill_value), + Canonical::Struct(_) | Canonical::List(_) | Canonical::FixedSizeList(_) => { + fill_null(canonical.as_ref(), fill_value)? + } }) } +fn fill_extension_array(array: &ExtensionArray, fill_value: &Scalar) -> ArrayRef { + let filled_storage = fill_null_canonical_array( + array.storage().to_canonical(), + &fill_value.as_extension().storage(), + ) + .vortex_expect("fill_null should succeed in canonical form"); + + if filled_storage.dtype().nullability() == array.ext_dtype().storage_dtype().nullability() { + ExtensionArray::new(array.ext_dtype().clone(), filled_storage).into_array() + } else { + let new_ext_dtype = Arc::new(ExtDType::new( + array.ext_dtype().id().clone(), + Arc::new(filled_storage.dtype().clone()), + array.ext_dtype().metadata().cloned(), + )); + + ExtensionArray::new(new_ext_dtype, filled_storage).into_array() + } +} + fn fill_bool_array( array: &BoolArray, fill_value: &Scalar, diff --git a/fuzz/src/array/filter.rs b/fuzz/src/array/filter.rs index 4d6fe1c0e03..b8ab6fe02f9 100644 --- a/fuzz/src/array/filter.rs +++ b/fuzz/src/array/filter.rs @@ -1,6 +1,8 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors +use std::sync::Arc; + use vortex_array::Array; use vortex_array::ArrayRef; use vortex_array::IntoArray; @@ -8,6 +10,7 @@ use vortex_array::ToCanonical; use vortex_array::accessor::ArrayAccessor; use vortex_array::arrays::BoolArray; use vortex_array::arrays::DecimalArray; +use vortex_array::arrays::ExtensionArray; use vortex_array::arrays::PrimitiveArray; use vortex_array::arrays::StructArray; use vortex_array::arrays::VarBinViewArray; @@ -15,6 +18,7 @@ use vortex_array::validity::Validity; use vortex_buffer::BitBuffer; use vortex_buffer::Buffer; use vortex_dtype::DType; +use vortex_dtype::ExtDType; use vortex_dtype::match_each_decimal_value_type; use vortex_dtype::match_each_native_ptype; use vortex_error::VortexResult; @@ -115,8 +119,23 @@ pub fn filter_canonical_array(array: &dyn Array, filter: &[bool]) -> VortexResul } take_canonical_array_non_nullable_indices(array, indices.as_slice()) } - d @ (DType::Null | DType::Extension(_)) => { - unreachable!("DType {d} not supported for fuzzing") + DType::Extension(ext_dtype) => { + // Extension arrays delegate filter to their storage type + let filtered_storage = filter_canonical_array(array.to_extension().storage(), filter)?; + + if filtered_storage.dtype().nullability() == ext_dtype.storage_dtype().nullability() { + Ok(ExtensionArray::new(ext_dtype.clone(), filtered_storage).into_array()) + } else { + let new_ext_dtype = Arc::new(ExtDType::new( + ext_dtype.id().clone(), + Arc::new(filtered_storage.dtype().clone()), + ext_dtype.metadata().cloned(), + )); + Ok(ExtensionArray::new(new_ext_dtype, filtered_storage).into_array()) + } + } + DType::Null => { + unreachable!("Cannot search sorted on Null array") } } } diff --git a/fuzz/src/array/search_sorted.rs b/fuzz/src/array/search_sorted.rs index 0f0e7349e99..52181883d1a 100644 --- a/fuzz/src/array/search_sorted.rs +++ b/fuzz/src/array/search_sorted.rs @@ -129,8 +129,16 @@ pub fn search_sorted_canonical_array( let scalar_vals = (0..array.len()).map(|i| array.scalar_at(i)).collect_vec(); Ok(scalar_vals.search_sorted(&scalar.cast(array.dtype())?, side)) } - d @ (DType::Null | DType::Extension(_)) => { - unreachable!("DType {d} not supported for fuzzing") + DType::Extension(..) => { + // Extension arrays delegate search to their storage type + search_sorted_canonical_array( + array.to_extension().storage(), + &scalar.as_extension().storage(), + side, + ) + } + DType::Null => { + unreachable!("Cannot search sorted on Null array") } } } diff --git a/fuzz/src/array/slice.rs b/fuzz/src/array/slice.rs index eaa22a5dcf1..9aff43c6b21 100644 --- a/fuzz/src/array/slice.rs +++ b/fuzz/src/array/slice.rs @@ -1,5 +1,6 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors +use std::sync::Arc; use vortex_array::Array; use vortex_array::ArrayRef; @@ -8,6 +9,7 @@ use vortex_array::ToCanonical; use vortex_array::accessor::ArrayAccessor; use vortex_array::arrays::BoolArray; use vortex_array::arrays::DecimalArray; +use vortex_array::arrays::ExtensionArray; use vortex_array::arrays::FixedSizeListArray; use vortex_array::arrays::ListViewArray; use vortex_array::arrays::PrimitiveArray; @@ -15,6 +17,7 @@ use vortex_array::arrays::StructArray; use vortex_array::arrays::VarBinViewArray; use vortex_array::validity::Validity; use vortex_dtype::DType; +use vortex_dtype::ExtDType; use vortex_dtype::match_each_decimal_value_type; use vortex_dtype::match_each_native_ptype; use vortex_error::VortexResult; @@ -113,8 +116,24 @@ pub fn slice_canonical_array( .to_array(), ) } - d @ (DType::Null | DType::Extension(_)) => { - unreachable!("DType {d} not supported for fuzzing") + DType::Extension(ext_dtype) => { + // Extension arrays delegate slicing to their storage type + let sliced_storage = + slice_canonical_array(array.to_extension().storage(), start, stop)?; + + if sliced_storage.dtype().nullability() == ext_dtype.storage_dtype().nullability() { + Ok(ExtensionArray::new(ext_dtype.clone(), sliced_storage).into_array()) + } else { + let new_ext_dtype = Arc::new(ExtDType::new( + ext_dtype.id().clone(), + Arc::new(sliced_storage.dtype().clone()), + ext_dtype.metadata().cloned(), + )); + Ok(ExtensionArray::new(new_ext_dtype, sliced_storage).into_array()) + } + } + DType::Null => { + unreachable!("Cannot search sorted on Null array") } } } diff --git a/fuzz/src/array/sort.rs b/fuzz/src/array/sort.rs index b1ffaf2f28e..a6e7049f6a9 100644 --- a/fuzz/src/array/sort.rs +++ b/fuzz/src/array/sort.rs @@ -2,6 +2,7 @@ // SPDX-FileCopyrightText: Copyright the Vortex contributors use std::cmp::Ordering; +use std::sync::Arc; use vortex_array::Array; use vortex_array::ArrayRef; @@ -10,9 +11,11 @@ use vortex_array::ToCanonical; use vortex_array::accessor::ArrayAccessor; use vortex_array::arrays::BoolArray; use vortex_array::arrays::DecimalArray; +use vortex_array::arrays::ExtensionArray; use vortex_array::arrays::PrimitiveArray; use vortex_array::arrays::VarBinViewArray; use vortex_dtype::DType; +use vortex_dtype::ExtDType; use vortex_dtype::NativePType; use vortex_dtype::match_each_decimal_value_type; use vortex_dtype::match_each_native_ptype; @@ -80,8 +83,24 @@ pub fn sort_canonical_array(array: &dyn Array) -> VortexResult { }); take_canonical_array_non_nullable_indices(array, &sort_indices) } - d @ (DType::Null | DType::Extension(_)) => { - unreachable!("DType {d} not supported for fuzzing") + DType::Null => { + // Null arrays don't need sorting - all elements are null + Ok(array.to_array()) + } + DType::Extension(ext_dtype) => { + // Extension arrays delegate sorting to their storage type + let sorted_storage = sort_canonical_array(array.to_extension().storage())?; + + if sorted_storage.dtype().nullability() == ext_dtype.storage_dtype().nullability() { + Ok(ExtensionArray::new(ext_dtype.clone(), sorted_storage).into_array()) + } else { + let new_ext_dtype = Arc::new(ExtDType::new( + ext_dtype.id().clone(), + Arc::new(sorted_storage.dtype().clone()), + ext_dtype.metadata().cloned(), + )); + Ok(ExtensionArray::new(new_ext_dtype, sorted_storage).into_array()) + } } } } diff --git a/fuzz/src/array/take.rs b/fuzz/src/array/take.rs index 6d4a3b7660b..f5e87a23ec4 100644 --- a/fuzz/src/array/take.rs +++ b/fuzz/src/array/take.rs @@ -1,6 +1,8 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors +use std::sync::Arc; + use vortex_array::Array; use vortex_array::ArrayRef; use vortex_array::IntoArray; @@ -8,6 +10,7 @@ use vortex_array::ToCanonical; use vortex_array::accessor::ArrayAccessor; use vortex_array::arrays::BoolArray; use vortex_array::arrays::DecimalArray; +use vortex_array::arrays::ExtensionArray; use vortex_array::arrays::PrimitiveArray; use vortex_array::arrays::StructArray; use vortex_array::arrays::VarBinViewArray; @@ -16,6 +19,7 @@ use vortex_array::validity::Validity; use vortex_buffer::Buffer; use vortex_dtype::DType; use vortex_dtype::DecimalDType; +use vortex_dtype::ExtDType; use vortex_dtype::NativeDecimalType; use vortex_dtype::NativePType; use vortex_dtype::Nullability; @@ -23,7 +27,6 @@ use vortex_dtype::match_each_decimal_value_type; use vortex_dtype::match_each_native_ptype; use vortex_error::VortexExpect; use vortex_error::VortexResult; - pub fn take_canonical_array_non_nullable_indices( array: &dyn Array, indices: &[usize], @@ -141,8 +144,23 @@ pub fn take_canonical_array( } Ok(builder.finish()) } - d @ (DType::Null | DType::Extension(_)) => { - unreachable!("DType {d} not supported for fuzzing") + DType::Extension(ext_dtype) => { + // Extension arrays delegate take to their storage type + let taken_storage = take_canonical_array(array.to_extension().storage(), indices)?; + + if taken_storage.dtype().nullability() == ext_dtype.storage_dtype().nullability() { + Ok(ExtensionArray::new(ext_dtype.clone(), taken_storage).into_array()) + } else { + let new_ext_dtype = Arc::new(ExtDType::new( + ext_dtype.id().clone(), + Arc::new(taken_storage.dtype().clone()), + ext_dtype.metadata().cloned(), + )); + Ok(ExtensionArray::new(new_ext_dtype, taken_storage).into_array()) + } + } + DType::Null => { + unreachable!("Null type not supported for fuzzing") } } } diff --git a/vortex-array/src/arrays/arbitrary.rs b/vortex-array/src/arrays/arbitrary.rs index 64da1c49778..e2ba09684ed 100644 --- a/vortex-array/src/arrays/arbitrary.rs +++ b/vortex-array/src/arrays/arbitrary.rs @@ -10,6 +10,7 @@ use arbitrary::Unstructured; use vortex_buffer::BitBuffer; use vortex_buffer::Buffer; use vortex_dtype::DType; +use vortex_dtype::ExtDType; use vortex_dtype::IntegerPType; use vortex_dtype::NativePType; use vortex_dtype::Nullability; @@ -164,12 +165,42 @@ fn random_array_chunk( DType::FixedSizeList(elem_dtype, list_size, null) => { random_fixed_size_list(u, elem_dtype, *list_size, *null, chunk_len) } - DType::Extension(..) => { - todo!("Extension arrays are not implemented") - } + DType::Extension(ext_dtype) => random_extension(u, ext_dtype, chunk_len), } } +/// Creates a random extension array. +/// +/// If the `chunk_len` is specified, the length of the array will be equal to the chunk length. +fn random_extension( + u: &mut Unstructured, + ext_dtype: &Arc, + chunk_len: Option, +) -> Result { + use crate::builders::ExtensionBuilder; + + // Determine array length + let array_length = chunk_len.unwrap_or(u.int_in_range(0..=20)?); + + // Create builder for the extension array + let mut builder = ExtensionBuilder::with_capacity(ext_dtype.clone(), array_length); + + // Generate random values + for _ in 0..array_length { + // Wrap in extension scalar using Scalar::extension() + let ext_scalar = Scalar::extension( + ext_dtype.clone(), + random_scalar(u, ext_dtype.storage_dtype())?, + ); + + // Append to builder + builder + .append_scalar(&ext_scalar) + .vortex_expect("can append extension scalar"); + } + + Ok(builder.finish()) +} /// Creates a random fixed-size list array. /// /// If the `chunk_len` is specified, the length of the array will be equal to the chunk length. diff --git a/vortex-array/src/compute/fill_null.rs b/vortex-array/src/compute/fill_null.rs index 3c5379116d3..b390beb3680 100644 --- a/vortex-array/src/compute/fill_null.rs +++ b/vortex-array/src/compute/fill_null.rs @@ -1,10 +1,12 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors +use std::sync::Arc; use std::sync::LazyLock; use arcref::ArcRef; use vortex_dtype::DType; +use vortex_dtype::ExtDType; use vortex_error::VortexError; use vortex_error::VortexResult; use vortex_error::vortex_bail; @@ -15,6 +17,8 @@ use crate::Array; use crate::ArrayRef; use crate::IntoArray; use crate::arrays::ConstantArray; +use crate::arrays::ExtensionArray; +use crate::canonical::ToCanonical; use crate::compute::ComputeFn; use crate::compute::ComputeFnVTable; use crate::compute::InvocationArgs; @@ -125,6 +129,36 @@ impl ComputeFnVTable for FillNull { return Ok(fill_null(canonical_arr.as_ref(), fill_value)?.into()); } + if matches!(array.dtype(), DType::Extension(..)) { + let filled_storage = fill_null( + array.to_extension().storage(), + &fill_value.as_extension().storage(), + )?; + + if filled_storage.dtype().nullability() + == array + .to_extension() + .ext_dtype() + .storage_dtype() + .nullability() + { + return Ok(ExtensionArray::new( + array.to_extension().ext_dtype().clone(), + filled_storage, + ) + .into_array() + .into()); + } else { + let new_ext_dtype = Arc::new(ExtDType::new( + array.to_extension().ext_dtype().id().clone(), + Arc::new(filled_storage.dtype().clone()), + array.to_extension().ext_dtype().metadata().cloned(), + )); + return Ok(ExtensionArray::new(new_ext_dtype, filled_storage) + .into_array() + .into()); + } + } // TODO(joe): update fuzzer when fixed vortex_bail!("fill null not implemented for DType {}", array.dtype()) } diff --git a/vortex-dtype/src/arbitrary/mod.rs b/vortex-dtype/src/arbitrary/mod.rs index 2ea4421098d..d159c157e87 100644 --- a/vortex-dtype/src/arbitrary/mod.rs +++ b/vortex-dtype/src/arbitrary/mod.rs @@ -9,6 +9,8 @@ use arbitrary::Unstructured; use crate::DType; use crate::DecimalDType; +use crate::ExtDType; +use crate::ExtID; use crate::FieldName; use crate::FieldNames; use crate::NativeDecimalType; @@ -16,7 +18,6 @@ use crate::Nullability; use crate::PType; use crate::StructFields; use crate::i256; - mod decimal; impl<'a> Arbitrary<'a> for DType { @@ -35,7 +36,7 @@ impl<'a> Arbitrary<'a> for FieldName { fn random_dtype(u: &mut Unstructured<'_>, depth: u8) -> Result { const BASE_TYPE_COUNT: i32 = 5; // TODO(joe): update to 3 once fsl works - const CONTAINER_TYPE_COUNT: i32 = 2; + const CONTAINER_TYPE_COUNT: i32 = 3; let max_dtype_kind = if depth == 0 { BASE_TYPE_COUNT } else { @@ -52,6 +53,8 @@ fn random_dtype(u: &mut Unstructured<'_>, depth: u8) -> Result { // container types 6 => DType::Struct(random_struct_dtype(u, depth - 1)?, u.arbitrary()?), 7 => DType::List(Arc::new(random_dtype(u, depth - 1)?), u.arbitrary()?), + 8 => DType::Extension(Arc::new(random_ext_dtype(u, depth - 1)?)), + // 8 => DType::FixedSizeList( // Arc::new(random_dtype(u, depth - 1)?), // // We limit the list size to 3 rather (following random struct fields). @@ -102,6 +105,33 @@ impl<'a> Arbitrary<'a> for DecimalDType { } } +impl<'a> Arbitrary<'a> for ExtDType { + fn arbitrary(u: &mut Unstructured<'a>) -> Result { + random_ext_dtype(u, 1) + } +} + +fn random_ext_dtype(u: &mut Unstructured<'_>, _depth: u8) -> Result { + let id_str = u.choose(&[ + "test.ext", + "example.currency", + "example.uuid", + "example.ipv4", + ])?; + let ext_id = ExtID::from(*id_str); + + // Supports only base types for now + let storage_dtype = match u.int_in_range(1..=3)? { + // base types + 1 => DType::Bool(u.arbitrary()?), + 2 => DType::Primitive(u.arbitrary()?, u.arbitrary()?), + 3 => DType::Decimal(u.arbitrary()?, u.arbitrary()?), + _ => unreachable!("int_in_range(1..=3) returned value out of range"), + }; + + Ok(ExtDType::new(ext_id, storage_dtype.into(), None)) +} + impl<'a> Arbitrary<'a> for StructFields { fn arbitrary(u: &mut Unstructured<'a>) -> Result { random_struct_dtype(u, 1) diff --git a/vortex-scalar/src/arbitrary.rs b/vortex-scalar/src/arbitrary.rs index 45e4d9e85ec..274c4f5ff49 100644 --- a/vortex-scalar/src/arbitrary.rs +++ b/vortex-scalar/src/arbitrary.rs @@ -67,9 +67,7 @@ fn random_scalar_value(u: &mut Unstructured, dtype: &DType) -> Result>>()? .into(), ))), - DType::Extension(..) => { - unreachable!("Can't yet generate arbitrary scalars for ext dtype") - } + DType::Extension(ext_dtype) => random_scalar_value(u, ext_dtype.storage_dtype()), } } From 5b99fa6fd0c2d9505d2ce0bdf156b4d9c39360a6 Mon Sep 17 00:00:00 2001 From: Pratham Agarwal Date: Sat, 27 Dec 2025 22:44:46 +0530 Subject: [PATCH 2/2] feat: added validator functions for extension arrays Signed-off-by: Pratham Agarwal --- fuzz/src/array/mod.rs | 18 +++ fuzz/src/error.rs | 10 ++ vortex-array/src/arrays/arbitrary.rs | 34 +++-- vortex-array/src/arrays/extension/mod.rs | 3 + vortex-array/src/arrays/extension/validate.rs | 138 ++++++++++++++++++ vortex-dtype/src/arbitrary/mod.rs | 83 ++++++++--- 6 files changed, 255 insertions(+), 31 deletions(-) create mode 100644 vortex-array/src/arrays/extension/validate.rs diff --git a/fuzz/src/array/mod.rs b/fuzz/src/array/mod.rs index e93fe96bf7f..7fa5babc317 100644 --- a/fuzz/src/array/mod.rs +++ b/fuzz/src/array/mod.rs @@ -662,6 +662,9 @@ pub fn assert_array_eq( rhs: &ArrayRef, step: usize, ) -> crate::error::VortexFuzzResult<()> { + use vortex_array::ToCanonical; + use vortex_array::arrays::validator_for_ext_type; + use crate::error::Backtrace; use crate::error::VortexFuzzError; @@ -699,7 +702,22 @@ pub fn assert_array_eq( Backtrace::capture(), )); } + + // Also validate the expected array's domain constraints for extension types + if matches!(lhs.dtype(), DType::Extension(..)) { + let validator = validator_for_ext_type(lhs.to_extension().ext_dtype()); + if !validator(&l) { + return Err(VortexFuzzError::DomainValidationFailed( + l, + idx, + lhs.clone(), + step, + Backtrace::capture(), + )); + } + } } + Ok(()) } diff --git a/fuzz/src/error.rs b/fuzz/src/error.rs index c28febdd984..e0d5d75abba 100644 --- a/fuzz/src/error.rs +++ b/fuzz/src/error.rs @@ -59,6 +59,8 @@ pub enum VortexFuzzError { LengthMismatch(usize, usize, ArrayRef, ArrayRef, usize, Backtrace), VortexError(VortexError, Backtrace), + + DomainValidationFailed(Scalar, usize, ArrayRef, usize, Backtrace), } impl Debug for VortexFuzzError { @@ -121,6 +123,13 @@ impl Display for VortexFuzzError { rhs.display_tree(), ) } + VortexFuzzError::DomainValidationFailed(expected, idx, lhs, step, backtrace) => { + write!( + f, + "Domain validation failed:\n Scalar: {expected}\n Index: {idx}\n Array: {}\n Step: {step}\nBacktrace:\n{backtrace}", + lhs.display_tree(), + ) + } VortexFuzzError::VortexError(err, backtrace) => { write!(f, "{err}\nBacktrace:\n{backtrace}") } @@ -137,6 +146,7 @@ impl Error for VortexFuzzError { | VortexFuzzError::LengthMismatch(..) | VortexFuzzError::ScalarMismatch(..) | VortexFuzzError::MinMaxMismatch(..) + | VortexFuzzError::DomainValidationFailed(..) | VortexFuzzError::DTypeMismatch(..) => None, } } diff --git a/vortex-array/src/arrays/arbitrary.rs b/vortex-array/src/arrays/arbitrary.rs index e2ba09684ed..171f311838e 100644 --- a/vortex-array/src/arrays/arbitrary.rs +++ b/vortex-array/src/arrays/arbitrary.rs @@ -31,6 +31,7 @@ use crate::IntoArray; use crate::ToCanonical; use crate::arrays::VarBinArray; use crate::arrays::VarBinViewArray; +use crate::arrays::validator_for_ext_type; use crate::builders::ArrayBuilder; use crate::builders::DecimalBuilder; use crate::builders::FixedSizeListBuilder; @@ -170,7 +171,6 @@ fn random_array_chunk( } /// Creates a random extension array. -/// /// If the `chunk_len` is specified, the length of the array will be equal to the chunk length. fn random_extension( u: &mut Unstructured, @@ -179,24 +179,34 @@ fn random_extension( ) -> Result { use crate::builders::ExtensionBuilder; + // Get the validator for this extension type + let validator = validator_for_ext_type(ext_dtype); + // Determine array length let array_length = chunk_len.unwrap_or(u.int_in_range(0..=20)?); // Create builder for the extension array let mut builder = ExtensionBuilder::with_capacity(ext_dtype.clone(), array_length); - // Generate random values + // Generate random values, retrying if they don't pass validation for _ in 0..array_length { - // Wrap in extension scalar using Scalar::extension() - let ext_scalar = Scalar::extension( - ext_dtype.clone(), - random_scalar(u, ext_dtype.storage_dtype())?, - ); - - // Append to builder - builder - .append_scalar(&ext_scalar) - .vortex_expect("can append extension scalar"); + // Retry loop to generate valid values + loop { + // Generate a random storage scalar + let storage_scalar = random_scalar(u, ext_dtype.storage_dtype())?; + + // Wrap it in an extension scalar + let ext_scalar = Scalar::extension(ext_dtype.clone(), storage_scalar); + + // Validate the scalar + if validator(&ext_scalar) { + // Valid value - append and break + builder + .append_scalar(&ext_scalar) + .vortex_expect("can append extension scalar"); + break; + } + } } Ok(builder.finish()) diff --git a/vortex-array/src/arrays/extension/mod.rs b/vortex-array/src/arrays/extension/mod.rs index 10569235442..757fb76347a 100644 --- a/vortex-array/src/arrays/extension/mod.rs +++ b/vortex-array/src/arrays/extension/mod.rs @@ -8,3 +8,6 @@ mod compute; mod vtable; pub use vtable::ExtensionVTable; + +mod validate; +pub use validate::validator_for_ext_type; diff --git a/vortex-array/src/arrays/extension/validate.rs b/vortex-array/src/arrays/extension/validate.rs new file mode 100644 index 00000000000..4869a1989cb --- /dev/null +++ b/vortex-array/src/arrays/extension/validate.rs @@ -0,0 +1,138 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Domain validation for extension types. +//! +//! This module provides validators to ensure that extension array values +//! are always within the valid domain for their type. For example, temporal +//! arrays should not overflow when converted to Jiff types. + +use vortex_dtype::ExtDType; +use vortex_dtype::PType; +use vortex_dtype::datetime::TemporalMetadata; +use vortex_dtype::datetime::is_temporal_ext_type; +use vortex_error::VortexExpect; +use vortex_scalar::Scalar; + +/// Type alias for a domain validator function. +/// +/// A domain validator checks whether a scalar value is valid for a given extension type. +/// For example, temporal extension types validate that values don't overflow when converted to Jiff types. +pub type DomainValidator = Box bool + Send + Sync>; + +/// Creates a domain validator for the given extension type. +/// +/// This function returns a validator that checks if scalar values are in the valid domain +/// for the extension type. For temporal types (date, time, timestamp), it validates that +/// the values can be successfully converted to Jiff types without overflow. +/// +/// # Examples +/// +/// ``` +/// use std::sync::Arc; +/// use vortex_array::arrays::extension::validator_for_ext_type; +/// use vortex_dtype::{ExtDType, ExtMetadata, DType, PType, Nullability}; +/// use vortex_dtype::datetime::{TemporalMetadata, TimeUnit, DATE_ID}; +/// use vortex_scalar::Scalar; +/// +/// let metadata: ExtMetadata = TemporalMetadata::Date(TimeUnit::Days).into(); +/// let ext_dtype = ExtDType::new( +/// DATE_ID.clone(), +/// Arc::new(DType::Primitive(PType::I32, Nullability::NonNullable)), +/// Some(metadata), +/// ); +/// +/// let validator = validator_for_ext_type(&ext_dtype); +/// +/// // Valid date value +/// let valid_scalar = Scalar::extension( +/// Arc::new(ext_dtype.clone()), +/// Scalar::primitive(18000i32, Nullability::NonNullable), +/// ); +/// assert!(validator(&valid_scalar)); +/// +/// // Null is always valid +/// let null_scalar = Scalar::null(DType::Extension(Arc::new(ext_dtype))); +/// assert!(validator(&null_scalar)); +/// ``` +pub fn validator_for_ext_type(ext_dtype: &ExtDType) -> DomainValidator { + if is_temporal_ext_type(ext_dtype.id()) { + // For temporal types, validate that the value can be converted to Jiff + let metadata = TemporalMetadata::try_from(ext_dtype) + .vortex_expect("temporal ext_dtype should have valid metadata"); + + Box::new(move |scalar: &Scalar| { + if scalar.is_null() { + return true; + } + + // Extract the storage value and validate it can be converted to Jiff + let ext_scalar = scalar.as_extension(); + let storage = ext_scalar.storage(); + let primitive = storage.as_primitive(); + + // Get the i64 value from the primitive (temporal types use i32 or i64) + let value = match primitive.ptype() { + PType::I32 => primitive.typed_value::().map(|v| v as i64), + PType::I64 => primitive.typed_value::(), + _ => None, + }; + + value.map(|v| metadata.to_jiff(v).is_ok()).unwrap_or(false) + }) + } else { + // Unknown extension type - accept all values + Box::new(|_| true) + } +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use vortex_dtype::DType; + use vortex_dtype::ExtDType; + use vortex_dtype::ExtMetadata; + use vortex_dtype::Nullability; + use vortex_dtype::PType; + use vortex_dtype::datetime::DATE_ID; + use vortex_dtype::datetime::TemporalMetadata; + use vortex_dtype::datetime::TimeUnit; + use vortex_scalar::Scalar; + + use super::*; + + #[test] + fn test_temporal_validator_accepts_valid_values() { + let metadata: ExtMetadata = TemporalMetadata::Date(TimeUnit::Days).into(); + let ext_dtype = ExtDType::new( + DATE_ID.clone(), + Arc::new(DType::Primitive(PType::I32, Nullability::NonNullable)), + Some(metadata), + ); + + let validator = validator_for_ext_type(&ext_dtype); + + // Valid date (days since epoch) + let valid_scalar = Scalar::extension( + Arc::new(ext_dtype.clone()), + Scalar::primitive(18000i32, Nullability::NonNullable), + ); + assert!(validator(&valid_scalar)); + } + + #[test] + fn test_temporal_validator_accepts_null() { + let metadata: ExtMetadata = TemporalMetadata::Date(TimeUnit::Days).into(); + let ext_dtype = ExtDType::new( + DATE_ID.clone(), + Arc::new(DType::Primitive(PType::I32, Nullability::Nullable)), + Some(metadata), + ); + + let validator = validator_for_ext_type(&ext_dtype); + + let null_scalar = Scalar::null(DType::Extension(Arc::new(ext_dtype))); + assert!(validator(&null_scalar)); + } +} diff --git a/vortex-dtype/src/arbitrary/mod.rs b/vortex-dtype/src/arbitrary/mod.rs index d159c157e87..70a0af05619 100644 --- a/vortex-dtype/src/arbitrary/mod.rs +++ b/vortex-dtype/src/arbitrary/mod.rs @@ -10,13 +10,17 @@ use arbitrary::Unstructured; use crate::DType; use crate::DecimalDType; use crate::ExtDType; -use crate::ExtID; use crate::FieldName; use crate::FieldNames; use crate::NativeDecimalType; use crate::Nullability; use crate::PType; use crate::StructFields; +use crate::datetime::DATE_ID; +use crate::datetime::TIME_ID; +use crate::datetime::TIMESTAMP_ID; +use crate::datetime::TemporalMetadata; +use crate::datetime::TimeUnit; use crate::i256; mod decimal; @@ -112,24 +116,65 @@ impl<'a> Arbitrary<'a> for ExtDType { } fn random_ext_dtype(u: &mut Unstructured<'_>, _depth: u8) -> Result { - let id_str = u.choose(&[ - "test.ext", - "example.currency", - "example.uuid", - "example.ipv4", - ])?; - let ext_id = ExtID::from(*id_str); - - // Supports only base types for now - let storage_dtype = match u.int_in_range(1..=3)? { - // base types - 1 => DType::Bool(u.arbitrary()?), - 2 => DType::Primitive(u.arbitrary()?, u.arbitrary()?), - 3 => DType::Decimal(u.arbitrary()?, u.arbitrary()?), - _ => unreachable!("int_in_range(1..=3) returned value out of range"), - }; - - Ok(ExtDType::new(ext_id, storage_dtype.into(), None)) + let choice = u.int_in_range(0..=2)?; + + match choice { + 0 => { + // DATE: i32 (Days) or i64 (Milliseconds) + let (ptype, time_unit) = match u.int_in_range(0..=1)? { + 0 => (PType::I32, TimeUnit::Days), + 1 => (PType::I64, TimeUnit::Milliseconds), + _ => unreachable!(), + }; + + Ok(ExtDType::new( + DATE_ID.clone(), + DType::Primitive(ptype, u.arbitrary()?).into(), + Some(TemporalMetadata::Date(time_unit).into()), + )) + } + 1 => { + // TIME: i32 for Seconds/Milliseconds, i64 for Microseconds/Nanoseconds + let (ptype, time_unit) = match u.int_in_range(0..=3)? { + 0 => (PType::I32, TimeUnit::Seconds), + 1 => (PType::I32, TimeUnit::Milliseconds), + 2 => (PType::I64, TimeUnit::Microseconds), + 3 => (PType::I64, TimeUnit::Nanoseconds), + _ => unreachable!(), + }; + + Ok(ExtDType::new( + TIME_ID.clone(), + DType::Primitive(ptype, u.arbitrary()?).into(), + Some(TemporalMetadata::Time(time_unit).into()), + )) + } + 2 => { + // TIMESTAMP: always i64 with time unit and optional timezone + let time_unit = match u.int_in_range(0..=3)? { + 0 => TimeUnit::Seconds, + 1 => TimeUnit::Milliseconds, + 2 => TimeUnit::Microseconds, + 3 => TimeUnit::Nanoseconds, + _ => unreachable!(), + }; + + let time_zone = u + .arbitrary::()? + .then(|| { + u.choose(&["UTC", "America/New_York", "Europe/London", "Asia/Tokyo"]) + .map(|s| s.to_string()) + }) + .transpose()?; + + Ok(ExtDType::new( + TIMESTAMP_ID.clone(), + DType::Primitive(PType::I64, u.arbitrary()?).into(), + Some(TemporalMetadata::Timestamp(time_unit, time_zone).into()), + )) + } + _ => unreachable!(), + } } impl<'a> Arbitrary<'a> for StructFields {