Skip to content
14 changes: 14 additions & 0 deletions vortex-array/public-api.lock
Original file line number Diff line number Diff line change
Expand Up @@ -4444,8 +4444,14 @@ pub vortex_array::arrays::listview::ListViewDataParts::sizes: vortex_array::Arra

pub vortex_array::arrays::listview::ListViewDataParts::validity: vortex_array::validity::Validity

pub const vortex_array::arrays::listview::DEFAULT_REBUILD_DENSITY_THRESHOLD: f32

pub trait vortex_array::arrays::listview::ListViewArrayExt: vortex_array::TypedArrayRef<vortex_array::arrays::ListView>

pub fn vortex_array::arrays::listview::ListViewArrayExt::compute_density(&self, &mut vortex_array::ExecutionCtx) -> vortex_error::VortexResult<f32>

pub fn vortex_array::arrays::listview::ListViewArrayExt::compute_referenced_elements_mask(&self, &mut vortex_array::ExecutionCtx) -> vortex_error::VortexResult<vortex_mask::Mask>

pub fn vortex_array::arrays::listview::ListViewArrayExt::elements(&self) -> &vortex_array::ArrayRef

pub fn vortex_array::arrays::listview::ListViewArrayExt::list_elements_at(&self, usize) -> vortex_error::VortexResult<vortex_array::ArrayRef>
Expand All @@ -4462,10 +4468,16 @@ pub fn vortex_array::arrays::listview::ListViewArrayExt::size_at(&self, usize) -

pub fn vortex_array::arrays::listview::ListViewArrayExt::sizes(&self) -> &vortex_array::ArrayRef

pub fn vortex_array::arrays::listview::ListViewArrayExt::upper_bound_density(&self, &mut vortex_array::ExecutionCtx) -> vortex_error::VortexResult<f32>

pub fn vortex_array::arrays::listview::ListViewArrayExt::verify_is_zero_copy_to_list(&self) -> bool

impl<T: vortex_array::TypedArrayRef<vortex_array::arrays::ListView>> vortex_array::arrays::listview::ListViewArrayExt for T

pub fn T::compute_density(&self, &mut vortex_array::ExecutionCtx) -> vortex_error::VortexResult<f32>

pub fn T::compute_referenced_elements_mask(&self, &mut vortex_array::ExecutionCtx) -> vortex_error::VortexResult<vortex_mask::Mask>

pub fn T::elements(&self) -> &vortex_array::ArrayRef

pub fn T::list_elements_at(&self, usize) -> vortex_error::VortexResult<vortex_array::ArrayRef>
Expand All @@ -4482,6 +4494,8 @@ pub fn T::size_at(&self, usize) -> usize

pub fn T::sizes(&self) -> &vortex_array::ArrayRef

pub fn T::upper_bound_density(&self, &mut vortex_array::ExecutionCtx) -> vortex_error::VortexResult<f32>

pub fn T::verify_is_zero_copy_to_list(&self) -> bool

pub fn vortex_array::arrays::listview::list_from_list_view(vortex_array::arrays::ListViewArray) -> vortex_error::VortexResult<vortex_array::arrays::ListArray>
Expand Down
15 changes: 1 addition & 14 deletions vortex-array/src/arrays/filter/execute/listview.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,7 @@ use vortex_mask::MaskValues;

use crate::arrays::ListViewArray;
use crate::arrays::filter::execute::filter_validity;
use crate::arrays::listview;
use crate::arrays::listview::ListViewArrayExt;
use crate::arrays::listview::ListViewRebuildMode;

/// [`ListViewArray`] filter implementation.
///
Expand Down Expand Up @@ -55,18 +53,7 @@ pub fn filter_listview(array: &ListViewArray, selection_mask: &Arc<MaskValues>)
// - Offsets and sizes are derived from existing valid child arrays.
// - Offsets and sizes have the same length (both filtered by `selection_mask`).
// - Validity matches the filtered array's nullability.
let new_array = unsafe {
ListViewArray::new_unchecked(elements.clone(), new_offsets, new_sizes, new_validity)
};

let kept_row_fraction = selection_mask.true_count() as f32 / array.sizes().len() as f32;
if kept_row_fraction < listview::compute::REBUILD_DENSITY_THRESHOLD {
new_array
.rebuild(ListViewRebuildMode::MakeZeroCopyToList)
.vortex_expect("ListViewArray rebuild to zero-copy List should always succeed")
} else {
new_array
}
unsafe { ListViewArray::new_unchecked(elements.clone(), new_offsets, new_sizes, new_validity) }
}

#[cfg(test)]
Expand Down
117 changes: 117 additions & 0 deletions vortex-array/src/arrays/listview/array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,17 @@ use std::sync::Arc;

use num_traits::AsPrimitive;
use smallvec::smallvec;
use vortex_buffer::BitBufferMut;
use vortex_error::VortexExpect;
use vortex_error::VortexResult;
use vortex_error::vortex_bail;
use vortex_error::vortex_ensure;
use vortex_error::vortex_err;
use vortex_mask::Mask;

use crate::ArrayRef;
use crate::ArraySlots;
use crate::ExecutionCtx;
use crate::LEGACY_SESSION;
#[expect(deprecated)]
use crate::ToCanonical as _;
Expand All @@ -30,6 +33,7 @@ use crate::arrays::PrimitiveArray;
use crate::arrays::bool;
use crate::dtype::DType;
use crate::dtype::IntegerPType;
use crate::expr::stats::Stat;
use crate::match_each_integer_ptype;
use crate::validity::Validity;

Expand Down Expand Up @@ -311,6 +315,31 @@ impl Default for ListViewData {
}
}

/// Walks parallel `(offset, size)` slices and sets each half-open range `[offset, offset + size)` in `buf`.
///
/// # Panics
Copy link
Copy Markdown
Contributor

@connortsui20 connortsui20 May 22, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: I think the Rust idiomatic practice is to label this # Panics

https://rust-lang.github.io/rust-clippy/stable/index.html?search=clippy%3A%3Amissing_panics_doc#missing_panics_doc

(also below)

///
/// `offsets` and `sizes` must be the same length (which is always the case in valid `ListViewArray`s).
fn fill_referenced_mask<O: IntegerPType, S: IntegerPType>(
    buf: &mut BitBufferMut,
    offsets: &[O],
    sizes: &[S],
) {
    // Checked once up front so the zipped walk below can never silently drop a trailing view.
    assert_eq!(
        offsets.len(),
        sizes.len(),
        "offsets and sizes must be the same length"
    );

    // Each view marks the `size` positions starting at `offset` as referenced.
    for (&offset, &size) in offsets.iter().zip(sizes) {
        let start: usize = offset.as_();
        let size: usize = size.as_();
        buf.fill_range(start, start + size, true);
    }
}

pub trait ListViewArrayExt: TypedArrayRef<ListView> {
fn nullability(&self) -> crate::dtype::Nullability {
match self.as_ref().dtype() {
Expand Down Expand Up @@ -396,6 +425,94 @@ pub trait ListViewArrayExt: TypedArrayRef<ListView> {
let sizes_primitive = self.sizes().to_primitive();
validate_zctl(self.elements(), offsets_primitive, sizes_primitive).is_ok()
}

/// Returns a [`Mask`] of length `elements.len()` where each bit is set iff that
/// position in `elements` is referenced by at least one view. Caller must ensure `elements`
/// is non-empty.
///
/// Walks every `(offset, size)` pair, canonicalizes both `offsets` and `sizes`,
/// and allocates a `BitBuffer` of length `elements.len()`, so it is extremely costly.
///
/// **Preconditions**
///
/// `self.elements()` must be non-empty.
fn compute_referenced_elements_mask(&self, ctx: &mut ExecutionCtx) -> VortexResult<Mask> {
assert!(!self.elements().is_empty());
let len = self.elements().len();

let offsets_primitive = self.offsets().clone().execute::<PrimitiveArray>(ctx)?;
let sizes_primitive = self.sizes().clone().execute::<PrimitiveArray>(ctx)?;

let mut buf = BitBufferMut::new_unset(len);

match_each_integer_ptype!(offsets_primitive.ptype(), |O| {
match_each_integer_ptype!(sizes_primitive.ptype(), |S| {
fill_referenced_mask::<O, S>(
&mut buf,
offsets_primitive.as_slice::<O>(),
sizes_primitive.as_slice::<S>(),
);
})
});
Comment thread
mhk197 marked this conversation as resolved.

Ok(Mask::from_buffer(buf.freeze()))
}

/// Exact fraction of `elements` that is referenced by at least one view, in `[0.0, 1.0]`.
///
/// Extremely costly, because it materializes the full referenced-elements mask via
/// [`compute_referenced_elements_mask`](Self::compute_referenced_elements_mask).
///
/// Returns `Ok(1.0)` when `elements` is empty (instead of dividing by 0) and `Ok(0.0)` when
/// there are elements but no views.
fn compute_density(&self, ctx: &mut ExecutionCtx) -> VortexResult<f32> {
    let element_count = self.elements().len();
    if element_count == 0 {
        return Ok(1.0);
    }

    // Elements exist but nothing points at them.
    if self.sizes().is_empty() {
        return Ok(0.0);
    }

    let referenced = self.compute_referenced_elements_mask(ctx)?;
    Ok(match referenced {
        Mask::AllTrue(_) => 1.0,
        Mask::AllFalse(_) => 0.0,
        Mask::Values(values) => values.true_count() as f32 / element_count as f32,
    })
}

/// Cheap upper bound on [`compute_density`](Self::compute_density), computed as
/// `sum(sizes) / elements.len()` and clamped to `[0.0, 1.0]`.
///
/// The bound is exact when no two views overlap; overlapping views count their shared elements
/// more than once, so the raw ratio can overshoot.
///
/// Returns `Ok(1.0)` when `elements` is empty (instead of dividing by 0) and `Ok(0.0)` when
/// there are elements but no views.
///
/// # Panics
///
/// Panics if the `Sum` statistic of `sizes` is unavailable or cannot be read as a `u64`.
fn upper_bound_density(&self, ctx: &mut ExecutionCtx) -> VortexResult<f32> {
    let element_count = self.elements().len();
    if element_count == 0 {
        return Ok(1.0);
    }

    let sizes = self.sizes();
    if sizes.is_empty() {
        return Ok(0.0);
    }

    // compute_stat short-circuits on a cached exact Sum and otherwise computes
    let sum_scalar = sizes
        .statistics()
        .compute_stat(Stat::Sum, ctx)?
        .vortex_expect("sizes array has integer ptype elements");
    let total_size = sum_scalar
        .as_primitive()
        .as_::<u64>()
        .vortex_expect("integer ptypes can be upcast to u64");

    // Views that share elements push the ratio above 1.0, hence the clamp.
    let ratio = total_size as f32 / element_count as f32;
    let estimate = ratio.min(1.0);

    debug_assert!(estimate >= 0.0);

    Ok(estimate)
}
}
/// Blanket implementation: every `TypedArrayRef<ListView>` gets the `ListViewArrayExt` methods.
impl<T: TypedArrayRef<ListView>> ListViewArrayExt for T {}

Expand Down
12 changes: 0 additions & 12 deletions vortex-array/src/arrays/listview/compute/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,3 @@ mod mask;
pub(crate) mod rules;
mod slice;
mod take;

/// The threshold below which we rebuild the elements of a listview.
///
/// We don't touch `elements` on the metadata-only path since reorganizing it can be expensive.
/// However, we also don't want to drag around a large amount of garbage data when the selection
/// is sparse. Below this fraction of list rows retained, the rebuild is worth it.
/// Rebuilding is needed when exporting the ListView's elements.
///
// TODO(connor)[ListView]: Ideally, we would only rebuild after all `take`s and `filter`
// compute functions have run, at the "top" of the operator tree. However, we cannot do this
// right now, so we will just rebuild every time (similar to [`ListArray`]).
pub(crate) const REBUILD_DENSITY_THRESHOLD: f32 = 0.1;
29 changes: 1 addition & 28 deletions vortex-array/src/arrays/listview/compute/take.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
use num_traits::Zero;
use vortex_error::VortexResult;

use super::REBUILD_DENSITY_THRESHOLD;
use crate::ArrayRef;
use crate::ExecutionCtx;
use crate::IntoArray;
Expand All @@ -14,7 +13,6 @@ use crate::arrays::ListViewArray;
use crate::arrays::dict::TakeExecute;
use crate::arrays::dict::TakeReduce;
use crate::arrays::listview::ListViewArrayExt;
use crate::arrays::listview::ListViewRebuildMode;
use crate::builtins::ArrayBuiltins;
use crate::dtype::Nullability;
use crate::match_each_integer_ptype;
Expand All @@ -23,43 +21,18 @@ use crate::scalar::Scalar;
/// Metadata-only take for [`ListViewArray`].
impl TakeReduce for ListView {
fn take(array: ArrayView<'_, ListView>, indices: &ArrayRef) -> VortexResult<Option<ArrayRef>> {
// Approximate element density by the fraction of list rows retained. Assumes roughly
// uniform list sizes; good enough to decide whether dragging along the full `elements`
// buffer is worth avoiding a rebuild.
let kept_row_fraction = indices.len() as f32 / array.sizes().len() as f32;
if kept_row_fraction < REBUILD_DENSITY_THRESHOLD {
return Ok(None);
}

Ok(Some(apply_take(array, indices)?.into_array()))
}
}

/// Execution-path take for [`ListViewArray`].
///
/// This does the same metadata-only take as [`TakeReduce`], but also rebuilds the array if the
/// resulting array will be less dense than `REBUILD_DENSITY_THRESHOLD`.
impl TakeExecute for ListView {
fn take(
array: ArrayView<'_, ListView>,
indices: &ArrayRef,
_ctx: &mut ExecutionCtx,
) -> VortexResult<Option<ArrayRef>> {
let kept_row_fraction = indices.len() as f32 / array.sizes().len() as f32;
let taken = apply_take(array, indices)?;

if kept_row_fraction < REBUILD_DENSITY_THRESHOLD {
// TODO(connor)[ListView]: Ideally, we would only rebuild after all `take`s and `filter`
// compute functions have run, at the "top" of the operator tree. However, we cannot do
// this right now, so we will just rebuild every time (similar to `ListArray`).
Ok(Some(
taken
.rebuild(ListViewRebuildMode::MakeZeroCopyToList)?
.into_array(),
))
} else {
Ok(Some(taken.into_array()))
}
Ok(Some(apply_take(array, indices)?.into_array()))
}
}

Expand Down
1 change: 1 addition & 0 deletions vortex-array/src/arrays/listview/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ pub use conversion::list_view_from_list;
pub use conversion::recursive_list_from_list_view;

mod rebuild;
pub use rebuild::DEFAULT_REBUILD_DENSITY_THRESHOLD;
pub use rebuild::ListViewRebuildMode;

#[cfg(test)]
Expand Down
10 changes: 10 additions & 0 deletions vortex-array/src/arrays/listview/rebuild.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,16 @@ use crate::match_each_integer_ptype;
use crate::scalar::Scalar;
use crate::scalar_fn::fns::operators::Operator;

/// Default density threshold for deciding whether to rebuild a sparse `ListViewArray`.
///
/// Metadata-only operations like `take` and `filter` can leave entries in the `elements` array
/// that no view references any more. When the density (the referenced fraction of `elements`)
/// falls below this threshold, the benefits of a rebuild may outweigh its cost.
///
/// This is a somewhat arbitrary rule of thumb and may be suboptimal depending on the use case
/// and the list element dtype.
pub const DEFAULT_REBUILD_DENSITY_THRESHOLD: f32 = 0.1;

/// Modes for rebuilding a [`ListViewArray`].
pub enum ListViewRebuildMode {
/// Removes all unused data and flattens out all list data, such that the array is zero-copyable
Expand Down
20 changes: 20 additions & 0 deletions vortex-array/src/arrays/listview/tests/common.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,15 @@ pub fn create_basic_listview() -> ListViewArray {
}
}

/// Creates a sparse, non-nullable ListView over 20 elements whose views sit in two
/// overlapping clusters, one at each end: `[[0,1,2], [1,2], [18,19], [19]]`.
pub fn create_sparse_overlapping_listview() -> ListViewArray {
    // NOTE(review): this relies on `buffer![0i32..20]` yielding the 20 values `0..20` rather
    // than a single `Range` element — confirm against the `buffer!` macro.
    ListViewArray::new(
        buffer![0i32..20].into_array(),
        buffer![0u32, 1, 18, 19].into_array(),
        buffer![3u32, 2, 2, 1].into_array(),
        Validity::NonNullable,
    )
}

/// Creates a nullable ListView: [[10,20], null, [50]]
pub fn create_nullable_listview() -> ListViewArray {
let elements = buffer![10i32, 20, 30, 40, 50].into_array();
Expand All @@ -45,6 +54,17 @@ pub fn create_empty_lists_listview() -> ListViewArray {
}
}

/// Creates a ListView with no lists and no elements: `[]`.
pub fn create_empty_elements_listview() -> ListViewArray {
    let no_elements = PrimitiveArray::from_iter::<[i32; 0]>([]).into_array();
    let no_offsets = buffer![0u32; 0].into_array();
    let no_sizes = buffer![0u32; 0].into_array();

    // SAFETY: `offsets` and `sizes` are both zero-length, so they have the same length and
    // there is no view that could point outside `elements`. With no views at all, flagging
    // the array as zero-copyable to a list cannot misdescribe any list.
    unsafe {
        ListViewArray::new_unchecked(no_elements, no_offsets, no_sizes, Validity::NonNullable)
            .with_zero_copy_to_list(true)
    }
}

/// Creates a ListView with overlapping lists and out-of-order offsets
/// Lists: [[5,6,7], [2,3], [8,9], [0,1], [1,2,3,4]]
pub fn create_overlapping_listview() -> ListViewArray {
Expand Down
Loading
Loading