diff --git a/vortex-array/public-api.lock b/vortex-array/public-api.lock index c5184c008cd..b57fb580df5 100644 --- a/vortex-array/public-api.lock +++ b/vortex-array/public-api.lock @@ -4444,8 +4444,14 @@ pub vortex_array::arrays::listview::ListViewDataParts::sizes: vortex_array::Arra pub vortex_array::arrays::listview::ListViewDataParts::validity: vortex_array::validity::Validity +pub const vortex_array::arrays::listview::DEFAULT_REBUILD_DENSITY_THRESHOLD: f32 + pub trait vortex_array::arrays::listview::ListViewArrayExt: vortex_array::TypedArrayRef +pub fn vortex_array::arrays::listview::ListViewArrayExt::compute_density(&self, &mut vortex_array::ExecutionCtx) -> vortex_error::VortexResult + +pub fn vortex_array::arrays::listview::ListViewArrayExt::compute_referenced_elements_mask(&self, &mut vortex_array::ExecutionCtx) -> vortex_error::VortexResult + pub fn vortex_array::arrays::listview::ListViewArrayExt::elements(&self) -> &vortex_array::ArrayRef pub fn vortex_array::arrays::listview::ListViewArrayExt::list_elements_at(&self, usize) -> vortex_error::VortexResult @@ -4462,10 +4468,16 @@ pub fn vortex_array::arrays::listview::ListViewArrayExt::size_at(&self, usize) - pub fn vortex_array::arrays::listview::ListViewArrayExt::sizes(&self) -> &vortex_array::ArrayRef +pub fn vortex_array::arrays::listview::ListViewArrayExt::upper_bound_density(&self, &mut vortex_array::ExecutionCtx) -> vortex_error::VortexResult + pub fn vortex_array::arrays::listview::ListViewArrayExt::verify_is_zero_copy_to_list(&self) -> bool impl> vortex_array::arrays::listview::ListViewArrayExt for T +pub fn T::compute_density(&self, &mut vortex_array::ExecutionCtx) -> vortex_error::VortexResult + +pub fn T::compute_referenced_elements_mask(&self, &mut vortex_array::ExecutionCtx) -> vortex_error::VortexResult + pub fn T::elements(&self) -> &vortex_array::ArrayRef pub fn T::list_elements_at(&self, usize) -> vortex_error::VortexResult @@ -4482,6 +4494,8 @@ pub fn T::size_at(&self, usize) -> usize 
pub fn T::sizes(&self) -> &vortex_array::ArrayRef +pub fn T::upper_bound_density(&self, &mut vortex_array::ExecutionCtx) -> vortex_error::VortexResult + pub fn T::verify_is_zero_copy_to_list(&self) -> bool pub fn vortex_array::arrays::listview::list_from_list_view(vortex_array::arrays::ListViewArray) -> vortex_error::VortexResult diff --git a/vortex-array/src/arrays/filter/execute/listview.rs b/vortex-array/src/arrays/filter/execute/listview.rs index a9849383ec4..b3f787133ab 100644 --- a/vortex-array/src/arrays/filter/execute/listview.rs +++ b/vortex-array/src/arrays/filter/execute/listview.rs @@ -9,9 +9,7 @@ use vortex_mask::MaskValues; use crate::arrays::ListViewArray; use crate::arrays::filter::execute::filter_validity; -use crate::arrays::listview; use crate::arrays::listview::ListViewArrayExt; -use crate::arrays::listview::ListViewRebuildMode; /// [`ListViewArray`] filter implementation. /// @@ -55,18 +53,7 @@ pub fn filter_listview(array: &ListViewArray, selection_mask: &Arc) // - Offsets and sizes are derived from existing valid child arrays. // - Offsets and sizes have the same length (both filtered by `selection_mask`). // - Validity matches the filtered array's nullability. 
- let new_array = unsafe { - ListViewArray::new_unchecked(elements.clone(), new_offsets, new_sizes, new_validity) - }; - - let kept_row_fraction = selection_mask.true_count() as f32 / array.sizes().len() as f32; - if kept_row_fraction < listview::compute::REBUILD_DENSITY_THRESHOLD { - new_array - .rebuild(ListViewRebuildMode::MakeZeroCopyToList) - .vortex_expect("ListViewArray rebuild to zero-copy List should always succeed") - } else { - new_array - } + unsafe { ListViewArray::new_unchecked(elements.clone(), new_offsets, new_sizes, new_validity) } } #[cfg(test)] diff --git a/vortex-array/src/arrays/listview/array.rs b/vortex-array/src/arrays/listview/array.rs index 5c9350ed072..3e7aedc5f5e 100644 --- a/vortex-array/src/arrays/listview/array.rs +++ b/vortex-array/src/arrays/listview/array.rs @@ -7,14 +7,17 @@ use std::sync::Arc; use num_traits::AsPrimitive; use smallvec::smallvec; +use vortex_buffer::BitBufferMut; use vortex_error::VortexExpect; use vortex_error::VortexResult; use vortex_error::vortex_bail; use vortex_error::vortex_ensure; use vortex_error::vortex_err; +use vortex_mask::Mask; use crate::ArrayRef; use crate::ArraySlots; +use crate::ExecutionCtx; use crate::LEGACY_SESSION; #[expect(deprecated)] use crate::ToCanonical as _; @@ -30,6 +33,7 @@ use crate::arrays::PrimitiveArray; use crate::arrays::bool; use crate::dtype::DType; use crate::dtype::IntegerPType; +use crate::expr::stats::Stat; use crate::match_each_integer_ptype; use crate::validity::Validity; @@ -311,6 +315,31 @@ impl Default for ListViewData { } } +/// Walks parallel `(offset, size)` slices and sets each half-open range `[offset, offset + size)` in `buf`. +/// +/// **Preconditions** +/// +/// `offsets` and `sizes` must be the same length (which is always the case in valid `ListViewArray`s).
+fn fill_referenced_mask( + buf: &mut BitBufferMut, + offsets: &[O], + sizes: &[S], +) { + let len = offsets.len(); + + assert_eq!( + len, + sizes.len(), + "offsets and sizes must be the same length" + ); + + for i in 0..len { + let start: usize = offsets[i].as_(); + let size: usize = sizes[i].as_(); + buf.fill_range(start, start + size, true); + } +} + pub trait ListViewArrayExt: TypedArrayRef { fn nullability(&self) -> crate::dtype::Nullability { match self.as_ref().dtype() { @@ -396,6 +425,94 @@ pub trait ListViewArrayExt: TypedArrayRef { let sizes_primitive = self.sizes().to_primitive(); validate_zctl(self.elements(), offsets_primitive, sizes_primitive).is_ok() } + + /// Returns a [`Mask`] of length `elements.len()` where each bit is set iff that + /// position in `elements` is referenced by at least one view. Caller must ensure `elements` + /// is non-empty. + /// + /// Walks every `(offset, size)` pair, canonicalizes both `offsets` and `sizes`, + /// and allocates a `BitBuffer` of length `elements.len()`, so it is extremely costly. + /// + /// **Preconditions** + /// + /// `self.elements()` must be non-empty. + fn compute_referenced_elements_mask(&self, ctx: &mut ExecutionCtx) -> VortexResult { + assert!(!self.elements().is_empty()); + let len = self.elements().len(); + + let offsets_primitive = self.offsets().clone().execute::(ctx)?; + let sizes_primitive = self.sizes().clone().execute::(ctx)?; + + let mut buf = BitBufferMut::new_unset(len); + + match_each_integer_ptype!(offsets_primitive.ptype(), |O| { + match_each_integer_ptype!(sizes_primitive.ptype(), |S| { + fill_referenced_mask::( + &mut buf, + offsets_primitive.as_slice::(), + sizes_primitive.as_slice::(), + ); + }) + }); + + Ok(Mask::from_buffer(buf.freeze())) + } + + /// Exact fraction of `elements` referenced by some view, in `[0.0, 1.0]`. Extremely costly. + /// + /// Returns `Ok(1.0)` when `elements` is empty instead of dividing by 0. 
+ fn compute_density(&self, ctx: &mut ExecutionCtx) -> VortexResult { + if self.elements().is_empty() { + return Ok(1.0); + } + + if self.sizes().is_empty() { + return Ok(0.0); + } + + let density = match self.compute_referenced_elements_mask(ctx)? { + Mask::AllTrue(_) => 1.0, + Mask::AllFalse(_) => 0.0, + Mask::Values(values) => values.true_count() as f32 / self.elements().len() as f32, + }; + + Ok(density) + } + + /// Upper-bound estimate of [`compute_density`](Self::compute_density) via + /// `sum(sizes) / elements.len()`, clamped to `[0.0, 1.0]`. + /// + /// Exact for non-overlapping views, but overcounts when multiple views share the same elements. + /// + /// Returns `Ok(1.0)` when `elements` is empty instead of dividing by 0. + fn upper_bound_density(&self, ctx: &mut ExecutionCtx) -> VortexResult { + let n_elts = self.elements().len(); + if n_elts == 0 { + return Ok(1.0); + } + + let sizes = self.sizes(); + if sizes.is_empty() { + return Ok(0.0); + } + + // compute_stat short-circuits on a cached exact Sum and otherwise computes + let sizes_sum = sizes + .statistics() + .compute_stat(Stat::Sum, ctx)? + .vortex_expect("sizes array has integer ptype elements") + .as_primitive() + .as_::() + .vortex_expect("integer ptypes can be upcast to u64"); + + // if the same elements are referenced more than once the estimate may be + // greater than 1.0, so clamp + let estimate = (sizes_sum as f32 / n_elts as f32).min(1.0); + + debug_assert!(estimate >= 0.0); + + Ok(estimate) + } } impl> ListViewArrayExt for T {} diff --git a/vortex-array/src/arrays/listview/compute/mod.rs b/vortex-array/src/arrays/listview/compute/mod.rs index 3ea82cafb33..9a43503c4b5 100644 --- a/vortex-array/src/arrays/listview/compute/mod.rs +++ b/vortex-array/src/arrays/listview/compute/mod.rs @@ -6,15 +6,3 @@ mod mask; pub(crate) mod rules; mod slice; mod take; - -/// The threshold below which we rebuild the elements of a listview. 
-/// -/// We don't touch `elements` on the metadata-only path since reorganizing it can be expensive. -/// However, we also don't want to drag around a large amount of garbage data when the selection -/// is sparse. Below this fraction of list rows retained, the rebuild is worth it. -/// Rebuilding is needed when exporting the ListView's elements. -/// -// TODO(connor)[ListView]: Ideally, we would only rebuild after all `take`s and `filter` -// compute functions have run, at the "top" of the operator tree. However, we cannot do this -// right now, so we will just rebuild every time (similar to [`ListArray`]). -pub(crate) const REBUILD_DENSITY_THRESHOLD: f32 = 0.1; diff --git a/vortex-array/src/arrays/listview/compute/take.rs b/vortex-array/src/arrays/listview/compute/take.rs index 04e404a846e..2b6c016d2c3 100644 --- a/vortex-array/src/arrays/listview/compute/take.rs +++ b/vortex-array/src/arrays/listview/compute/take.rs @@ -4,7 +4,6 @@ use num_traits::Zero; use vortex_error::VortexResult; -use super::REBUILD_DENSITY_THRESHOLD; use crate::ArrayRef; use crate::ExecutionCtx; use crate::IntoArray; @@ -14,7 +13,6 @@ use crate::arrays::ListViewArray; use crate::arrays::dict::TakeExecute; use crate::arrays::dict::TakeReduce; use crate::arrays::listview::ListViewArrayExt; -use crate::arrays::listview::ListViewRebuildMode; use crate::builtins::ArrayBuiltins; use crate::dtype::Nullability; use crate::match_each_integer_ptype; @@ -23,43 +21,18 @@ use crate::scalar::Scalar; /// Metadata-only take for [`ListViewArray`]. impl TakeReduce for ListView { fn take(array: ArrayView<'_, ListView>, indices: &ArrayRef) -> VortexResult> { - // Approximate element density by the fraction of list rows retained. Assumes roughly - // uniform list sizes; good enough to decide whether dragging along the full `elements` - // buffer is worth avoiding a rebuild. 
- let kept_row_fraction = indices.len() as f32 / array.sizes().len() as f32; - if kept_row_fraction < REBUILD_DENSITY_THRESHOLD { - return Ok(None); - } - Ok(Some(apply_take(array, indices)?.into_array())) } } /// Execution-path take for [`ListViewArray`]. -/// -/// This does the same metadata-only take as [`TakeReduce`], but also rebuilds the array if the -/// resulting array will be less dense than `REBUILD_DENSITY_THRESHOLD`. impl TakeExecute for ListView { fn take( array: ArrayView<'_, ListView>, indices: &ArrayRef, _ctx: &mut ExecutionCtx, ) -> VortexResult> { - let kept_row_fraction = indices.len() as f32 / array.sizes().len() as f32; - let taken = apply_take(array, indices)?; - - if kept_row_fraction < REBUILD_DENSITY_THRESHOLD { - // TODO(connor)[ListView]: Ideally, we would only rebuild after all `take`s and `filter` - // compute functions have run, at the "top" of the operator tree. However, we cannot do - // this right now, so we will just rebuild every time (similar to `ListArray`). - Ok(Some( - taken - .rebuild(ListViewRebuildMode::MakeZeroCopyToList)? 
- .into_array(), - )) - } else { - Ok(Some(taken.into_array())) - } + Ok(Some(apply_take(array, indices)?.into_array())) } } diff --git a/vortex-array/src/arrays/listview/mod.rs b/vortex-array/src/arrays/listview/mod.rs index 77f2a279160..29dc50f9b9f 100644 --- a/vortex-array/src/arrays/listview/mod.rs +++ b/vortex-array/src/arrays/listview/mod.rs @@ -18,6 +18,7 @@ pub use conversion::list_view_from_list; pub use conversion::recursive_list_from_list_view; mod rebuild; +pub use rebuild::DEFAULT_REBUILD_DENSITY_THRESHOLD; pub use rebuild::ListViewRebuildMode; #[cfg(test)] diff --git a/vortex-array/src/arrays/listview/rebuild.rs b/vortex-array/src/arrays/listview/rebuild.rs index 416d73bb92a..0082cad427a 100644 --- a/vortex-array/src/arrays/listview/rebuild.rs +++ b/vortex-array/src/arrays/listview/rebuild.rs @@ -25,6 +25,16 @@ use crate::match_each_integer_ptype; use crate::scalar::Scalar; use crate::scalar_fn::fns::operators::Operator; +/// Density threshold to decide whether to rebuild a sparse `ListViewArray`. +/// +/// A `ListViewArray` can accumulate unreferenced bytes in its `elements` buffer after +/// metadata-only operations like `take` and `filter`. When density (referenced fraction of `elements`) +/// falls below this threshold, the benefits of a rebuild may outweigh its cost. +/// +/// This is a somewhat arbitrary rule-of-thumb and may be suboptimal depending on different use cases and +/// list element dtypes. +pub const DEFAULT_REBUILD_DENSITY_THRESHOLD: f32 = 0.1; + /// Modes for rebuilding a [`ListViewArray`]. 
pub enum ListViewRebuildMode { /// Removes all unused data and flattens out all list data, such that the array is zero-copyable diff --git a/vortex-array/src/arrays/listview/tests/common.rs b/vortex-array/src/arrays/listview/tests/common.rs index d8f25f522cf..dc8bc94fb18 100644 --- a/vortex-array/src/arrays/listview/tests/common.rs +++ b/vortex-array/src/arrays/listview/tests/common.rs @@ -22,6 +22,15 @@ pub fn create_basic_listview() -> ListViewArray { } } +/// Creates a sparse ListView with two overlap regions +/// `[[0,1,2], [1,2], [18, 19], [19]]` over 20 elements. +pub fn create_sparse_overlapping_listview() -> ListViewArray { + let elements = buffer![0i32..20].into_array(); + let offsets = buffer![0u32, 1, 18, 19].into_array(); + let sizes = buffer![3u32, 2, 2, 1].into_array(); + ListViewArray::new(elements, offsets, sizes, Validity::NonNullable) +} + /// Creates a nullable ListView: [[10,20], null, [50]] pub fn create_nullable_listview() -> ListViewArray { let elements = buffer![10i32, 20, 30, 40, 50].into_array(); @@ -45,6 +54,17 @@ pub fn create_empty_lists_listview() -> ListViewArray { } } +/// Creates a ListView with no lists and no elements: [] +pub fn create_empty_elements_listview() -> ListViewArray { + let elements = PrimitiveArray::from_iter::<[i32; 0]>([]).into_array(); + let offsets = buffer![0u32; 0].into_array(); + let sizes = buffer![0u32; 0].into_array(); + unsafe { + ListViewArray::new_unchecked(elements, offsets, sizes, Validity::NonNullable) + .with_zero_copy_to_list(true) + } +} + /// Creates a ListView with overlapping lists and out-of-order offsets /// Lists: [[5,6,7], [2,3], [8,9], [0,1], [1,2,3,4]] pub fn create_overlapping_listview() -> ListViewArray { diff --git a/vortex-array/src/arrays/listview/tests/density.rs b/vortex-array/src/arrays/listview/tests/density.rs new file mode 100644 index 00000000000..3799bc9612c --- /dev/null +++ b/vortex-array/src/arrays/listview/tests/density.rs @@ -0,0 +1,139 @@ +// SPDX-License-Identifier:
Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Tests for `compute_referenced_elements_mask`, `compute_density`, and +//! `upper_bound_density` on `ListViewArray`. + +use vortex_error::VortexResult; +use vortex_mask::Mask; +use vortex_session::VortexSession; + +use super::common::create_basic_listview; +use super::common::create_empty_lists_listview; +use super::common::create_large_listview; +use super::common::create_overlapping_listview; +use super::common::create_sparse_overlapping_listview; +use crate::ExecutionCtx; +use crate::VortexSessionExecute; +use crate::arrays::listview::ListViewArrayExt; +use crate::arrays::listview::tests::common::create_empty_elements_listview; +use crate::expr::stats::Precision; +use crate::expr::stats::Stat; +use crate::scalar::ScalarValue; +use crate::session::ArraySession; + +const EPS: f32 = 1e-6; + +fn test_execution_ctx() -> ExecutionCtx { + let session = VortexSession::empty().with::(); + session.create_execution_ctx() +} + +#[test] +fn full_density_no_overlap() -> VortexResult<()> { + let mut ctx = test_execution_ctx(); + let lv = create_basic_listview(); + let exact = lv.compute_density(&mut ctx)?; + let est = lv.upper_bound_density(&mut ctx)?; + + assert!((exact - 1.0).abs() < EPS); + assert!((est - 1.0).abs() < EPS); + Ok(()) +} + +#[test] +fn sparse_no_overlap_matches_exact() -> VortexResult<()> { + let mut ctx = test_execution_ctx(); + let lv = create_large_listview(); + let exact = lv.compute_density(&mut ctx)?; + let est = lv.upper_bound_density(&mut ctx)?; + + assert!((exact - 0.5).abs() < EPS); + assert!((est - 0.5).abs() < EPS); + Ok(()) +} + +#[test] +fn all_empty_lists_is_zero_density() -> VortexResult<()> { + let mut ctx = test_execution_ctx(); + let lv = create_empty_lists_listview(); + let exact = lv.compute_density(&mut ctx)?; + let est = lv.upper_bound_density(&mut ctx)?; + + assert_eq!(exact, 0.0); + assert_eq!(est, 0.0); + Ok(()) +} + +#[test] +fn
overlap_full_coverage_clamps_estimate() -> VortexResult<()> { + let mut ctx = test_execution_ctx(); + let lv = create_overlapping_listview(); + let exact = lv.compute_density(&mut ctx)?; + let est = lv.upper_bound_density(&mut ctx)?; + + assert!((exact - 1.0).abs() < EPS); + assert!((est - 1.0).abs() < EPS); + Ok(()) +} + +#[test] +fn overlap_differential_exact_lower_than_estimate() -> VortexResult<()> { + let mut ctx = test_execution_ctx(); + let lv = create_sparse_overlapping_listview(); + + let exact = lv.compute_density(&mut ctx)?; + let est = lv.upper_bound_density(&mut ctx)?; + + assert!((exact - 0.25).abs() < EPS); + assert!((est - 0.40).abs() < EPS); + Ok(()) +} + +#[test] +fn empty_elements_returns_one() -> VortexResult<()> { + let mut ctx = test_execution_ctx(); + let lv = create_empty_elements_listview(); + + let exact = lv.compute_density(&mut ctx)?; + let est = lv.upper_bound_density(&mut ctx)?; + + assert!((exact - 1.0).abs() < EPS); + assert!((est - 1.0).abs() < EPS); + Ok(()) +} + +#[test] +fn estimate_uses_cached_sum_stat() -> VortexResult<()> { + let mut ctx = test_execution_ctx(); + let lv = create_basic_listview(); + // Pre-populate Stat::Sum with a deliberately-wrong 5 so we can prove + // upper_bound_density reads from the cache instead of computing fresh.
+ lv.sizes() + .statistics() + .set(Stat::Sum, Precision::Exact(ScalarValue::from(5u64))); + + let est = lv.upper_bound_density(&mut ctx)?; + assert!((est - 0.5).abs() < EPS); + Ok(()) +} + +#[test] +fn referenced_mask_set_bits_match_views() -> VortexResult<()> { + let mut ctx = test_execution_ctx(); + let lv = create_sparse_overlapping_listview(); + let mask = lv.compute_referenced_elements_mask(&mut ctx)?; + let bits = match mask { + Mask::Values(v) => v, + _ => panic!("expected Values mask"), + }; + + assert_eq!(bits.true_count(), 5); + let bb = bits.bit_buffer(); + for i in 0..3 { + assert!(bb.value(i)); + } + assert!(bb.value(18)); + assert!(bb.value(19)); + Ok(()) +} diff --git a/vortex-array/src/arrays/listview/tests/mod.rs b/vortex-array/src/arrays/listview/tests/mod.rs index dea78fd8a97..5e0c357282e 100644 --- a/vortex-array/src/arrays/listview/tests/mod.rs +++ b/vortex-array/src/arrays/listview/tests/mod.rs @@ -4,6 +4,7 @@ pub(super) mod common; mod basic; +mod density; mod filter; mod nested; mod nullability; diff --git a/vortex-array/src/arrow/executor/list_view.rs b/vortex-array/src/arrow/executor/list_view.rs index ef858fa9916..53d06863557 100644 --- a/vortex-array/src/arrow/executor/list_view.rs +++ b/vortex-array/src/arrow/executor/list_view.rs @@ -11,10 +11,12 @@ use vortex_error::vortex_ensure; use crate::ArrayRef; use crate::ExecutionCtx; -use crate::arrays::ListView; use crate::arrays::ListViewArray; use crate::arrays::PrimitiveArray; +use crate::arrays::listview::DEFAULT_REBUILD_DENSITY_THRESHOLD; +use crate::arrays::listview::ListViewArrayExt; use crate::arrays::listview::ListViewDataParts; +use crate::arrays::listview::ListViewRebuildMode; use crate::arrow::executor::validity::to_arrow_null_buffer; use crate::arrow::session::ArrowSessionExt; use crate::builtins::ArrayBuiltins; @@ -27,15 +29,19 @@ pub(super) fn to_arrow_list_view( elements_field: &FieldRef, ctx: &mut ExecutionCtx, ) -> VortexResult { - // Check for Vortex ListViewArray and 
convert directly. - let array = match array.try_downcast::() { - Ok(array) => return list_view_to_list_view::(array, elements_field, ctx), - Err(array) => array, + let array = array.execute::(ctx)?; + + // If the array is sufficiently sparse, rebuild before handing it to Arrow. Otherwise downstream + // consumers hold an elements buffer containing unreferenced data in memory indefinitely, + // and any compute pass over that buffer wastes work on data nothing references. + let density = array.upper_bound_density(ctx)?; + let array = if density < DEFAULT_REBUILD_DENSITY_THRESHOLD { + array.rebuild(ListViewRebuildMode::MakeZeroCopyToList)? + } else { + array }; - // Otherwise, we execute to ListViewArray and convert. - let list_view_array = array.execute::(ctx)?; - list_view_to_list_view::(list_view_array, elements_field, ctx) + list_view_to_list_view::(array, elements_field, ctx) } fn list_view_to_list_view( diff --git a/vortex-array/src/stats/array.rs b/vortex-array/src/stats/array.rs index fd41090c528..8c3e331dc15 100644 --- a/vortex-array/src/stats/array.rs +++ b/vortex-array/src/stats/array.rs @@ -153,6 +153,8 @@ impl StatsSetRef<'_> { f(&mut lock.iter()) } + /// Returns the value of `stat` by either fetching it from cache if it exists and is [`Precision::Exact`], or falling back to + /// computation. The underlying compute kernels will cache the computed stat in the latter case. pub fn compute_stat(&self, stat: Stat, ctx: &mut ExecutionCtx) -> VortexResult> { // If it's already computed and exact, we can return it. 
if let Some(Precision::Exact(s)) = self.get(stat) { diff --git a/vortex-duckdb/src/exporter/list_view.rs b/vortex-duckdb/src/exporter/list_view.rs index 0a3c718b465..816294bb923 100644 --- a/vortex-duckdb/src/exporter/list_view.rs +++ b/vortex-duckdb/src/exporter/list_view.rs @@ -10,7 +10,10 @@ use parking_lot::Mutex; use vortex::array::ExecutionCtx; use vortex::array::arrays::ListViewArray; use vortex::array::arrays::PrimitiveArray; +use vortex::array::arrays::listview::DEFAULT_REBUILD_DENSITY_THRESHOLD; +use vortex::array::arrays::listview::ListViewArrayExt; use vortex::array::arrays::listview::ListViewDataParts; +use vortex::array::arrays::listview::ListViewRebuildMode; use vortex::array::match_each_integer_ptype; use vortex::array::validity::Validity; use vortex::dtype::IntegerPType; @@ -49,7 +52,18 @@ pub(crate) fn new_exporter( cache: &ConversionCache, ctx: &mut ExecutionCtx, ) -> VortexResult> { + // If the array is sufficiently sparse, rebuild. Otherwise the DuckDB vector will + // hold an elements buffer containing unreferenced data in memory indefinitely, + // and any compute pass over that buffer wastes work on data nothing references. + let density = array.upper_bound_density(ctx)?; + let array = if density < DEFAULT_REBUILD_DENSITY_THRESHOLD { + array.rebuild(ListViewRebuildMode::MakeZeroCopyToList)? + } else { + array + }; + let len = array.len(); + let ListViewDataParts { elements_dtype, elements,