From a97240ad24944b5195e338f6482074789cde327b Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 21 May 2026 11:30:08 +0000 Subject: [PATCH 1/8] Add cast-on-decompression pushdown for bit-packed arrays Widening a bit-packed narrow integer column to a wider type (e.g. u16 -> u32) currently has no cast pushdown: cast(bit_packed) canonicalizes to a full-length narrow PrimitiveArray and then casts it, allocating two full-length buffers and round-tripping the narrow intermediate through RAM. Add `BitUnpackedChunks::decode_cast_into`, which unpacks each 1024-element FastLanes chunk into the existing cache-resident scratch buffer and maps each value through a closure into a differently-typed output, plus `unpack_and_cast_into_builder` which uses it to unpack straight into a wide PrimitiveBuilder (handling validity and patches). Add a divan benchmark (cast_bitpacked) comparing the current canonicalize-then-cast path against the pushdown, over single and chunked arrays, with and without patches. Signed-off-by: "Joe Isaacs" --- encodings/fastlanes/Cargo.toml | 5 + encodings/fastlanes/benches/cast_bitpacked.rs | 137 ++++++++++++++++++ .../bitpacking/array/bitpack_decompress.rs | 137 ++++++++++++++++++ .../src/bitpacking/array/unpack_iter.rs | 44 ++++++ 4 files changed, 323 insertions(+) create mode 100644 encodings/fastlanes/benches/cast_bitpacked.rs diff --git a/encodings/fastlanes/Cargo.toml b/encodings/fastlanes/Cargo.toml index 7b4cb5da069..08c96c481d7 100644 --- a/encodings/fastlanes/Cargo.toml +++ b/encodings/fastlanes/Cargo.toml @@ -63,3 +63,8 @@ required-features = ["_test-harness"] [[bench]] name = "bitpack_compare" harness = false + +[[bench]] +name = "cast_bitpacked" +harness = false +required-features = ["_test-harness"] diff --git a/encodings/fastlanes/benches/cast_bitpacked.rs b/encodings/fastlanes/benches/cast_bitpacked.rs new file mode 100644 index 00000000000..b2d5a89b3a2 --- /dev/null +++ b/encodings/fastlanes/benches/cast_bitpacked.rs @@ -0,0 +1,137 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Benchmarks the cost of widening a bit-packed narrow integer column to a wider integer type on +//! decompression (e.g. `u16 -> u32`). +//! +//! Three strategies are compared: +//! +//! - `canonical_cast`: the current path. `cast(bit_packed)` has no pushdown, so it canonicalizes to +//! a full-length `u16` `PrimitiveArray` and then casts that to `u32`. Two full-length buffers are +//! allocated and the `u16` intermediate is written to RAM and read back. +//! - `chunked_canonical_cast`: the same, but over a `ChunkedArray` of bit-packed chunks, i.e. +//! `chunked(cast(bit_packed))`. This is the shape produced by a scan over a chunked column. +//! - `pushdown`: unpacks each 1024-element FastLanes chunk into a cache-resident scratch buffer and +//! cast-copies straight into the `u32` output. Only the `u32` output is allocated/touched in RAM. + +#![expect(clippy::unwrap_used)] +#![expect(clippy::cast_possible_truncation)] + +use std::sync::LazyLock; + +use divan::Bencher; +use rand::RngExt; +use rand::SeedableRng; +use rand::prelude::StdRng; +use vortex_array::ArrayRef; +use vortex_array::IntoArray; +use vortex_array::VortexSessionExecute; +use vortex_array::arrays::ChunkedArray; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::builders::PrimitiveBuilder; +use vortex_array::builtins::ArrayBuiltins; +use vortex_array::dtype::DType; +use vortex_array::dtype::Nullability; +use vortex_array::dtype::PType; +use vortex_array::session::ArraySession; +use vortex_array::validity::Validity; +use vortex_buffer::BufferMut; +use vortex_error::VortexExpect; +use vortex_fastlanes::BitPackedArray; +use vortex_fastlanes::BitPackedData; +use vortex_fastlanes::bitpack_decompress::unpack_and_cast_into_builder; +use vortex_session::VortexSession; + +fn main() { + divan::main(); +} + +static SESSION: LazyLock = + LazyLock::new(|| VortexSession::empty().with::()); + +const U32: DType = DType::Primitive(PType::U32, Nullability::NonNullable); + +// (chunk_len, chunk_count, fraction_patched) +const ARGS: &[(usize, usize, f64)] = &[ + (65_536, 1, 0.00), + (65_536, 1, 0.01), + (65_536, 16, 0.00), + (65_536, 16, 0.01), + (1_048_576, 1, 0.00), + (1_048_576, 1, 0.01), +]; + +/// Build a single bit-packed `u16` chunk. Most values fit in `bit_width` bits; `fraction_patched` +/// of them are large enough to require patches. +fn make_chunk(rng: &mut StdRng, len: usize, fraction_patched: f64) -> BitPackedArray { + let bit_width = 9u8; + let cap = 1u16 << bit_width; + let values = (0..len) + .map(|_| { + if rng.random_bool(fraction_patched) { + rng.random_range(cap..u16::MAX) + } else { + rng.random_range(0..cap) + } + }) + .collect::>(); + let array = PrimitiveArray::new(values, Validity::NonNullable); + BitPackedData::encode( + &array.into_array(), + bit_width, + &mut SESSION.create_execution_ctx(), + ) + .vortex_expect("encode") +} + +fn make_chunks(len: usize, count: usize, fraction_patched: f64) -> Vec { + let mut rng = StdRng::seed_from_u64(0); + (0..count) + .map(|_| make_chunk(&mut rng, len, fraction_patched)) + .collect() +} + +fn single(chunks: &[BitPackedArray]) -> ArrayRef { + if chunks.len() == 1 { + chunks[0].clone().into_array() + } else { + ChunkedArray::from_iter(chunks.iter().map(|c| c.clone().into_array())).into_array() + } +} + +#[cfg(not(codspeed))] +#[divan::bench(args = ARGS)] +fn canonical_cast(bencher: Bencher, (chunk_len, chunk_count, frac): (usize, usize, f64)) { + let chunks = make_chunks(chunk_len, chunk_count, frac); + bencher + .with_inputs(|| (single(&chunks), SESSION.create_execution_ctx())) + .bench_refs(|(array, ctx)| { + array + .clone() + .cast(U32) + .unwrap() + .execute::(ctx) + .unwrap() + }); +} + +#[cfg(not(codspeed))] +#[divan::bench(args = ARGS)] +fn pushdown(bencher: Bencher, (chunk_len, chunk_count, frac): (usize, usize, f64)) { + let chunks = make_chunks(chunk_len, chunk_count, frac); + let total = chunk_len * chunk_count; + bencher + .with_inputs(|| { + ( + chunks.clone(), + PrimitiveBuilder::::with_capacity(Nullability::NonNullable, total), + SESSION.create_execution_ctx(), + ) + }) + .bench_refs(|(chunks, builder, ctx)| { + for chunk in chunks.iter() { + unpack_and_cast_into_builder::(chunk.as_view(), builder, ctx).unwrap(); + } + builder.finish_into_primitive() + }); +} diff --git a/encodings/fastlanes/src/bitpacking/array/bitpack_decompress.rs b/encodings/fastlanes/src/bitpacking/array/bitpack_decompress.rs index 6570d26bbc9..ce1b887b8e8 100644 --- a/encodings/fastlanes/src/bitpacking/array/bitpack_decompress.rs +++ b/encodings/fastlanes/src/bitpacking/array/bitpack_decompress.rs @@ -79,6 +79,85 @@ pub(crate) fn unpack_into_primitive_builder( Ok(()) } +/// Unpack a bit-packed array of physical type `F` directly into a wider primitive type `T`, +/// casting each value during decompression. +/// +/// This is the "cast pushdown" path: rather than canonicalizing to a full-length `F`-typed +/// `PrimitiveArray` and then casting it to `T` (two full-length buffers, with the `F` intermediate +/// written out to RAM), we unpack each 1024-element FastLanes chunk into a small cache-resident +/// scratch buffer and cast-copy straight into the `T` output. Only the `T` output buffer is +/// allocated and touched in RAM. +/// +/// The caller must ensure all valid values are representable in `T` (it is intended for widening +/// casts such as `u16 -> u32`); narrowing or sign-changing casts are not validated here. +pub fn unpack_and_cast_into_builder( + array: ArrayView<'_, BitPacked>, + builder: &mut PrimitiveBuilder, + ctx: &mut ExecutionCtx, +) -> VortexResult<()> +where + F: BitPackedUnpack + AsPrimitive, + T: NativePType, +{ + if array.is_empty() { + return Ok(()); + } + + let len = array.len(); + let mut uninit_range = builder.uninit_range(len); + + // SAFETY: We initialize all `len` values below via `decode_cast_into` and the patch loop. + unsafe { + uninit_range.append_mask(array.validity()?.execute_mask(len, ctx)?); + } + + // SAFETY: `decode_cast_into` writes a value to every slot in this range. + let uninit_slice = unsafe { uninit_range.slice_uninit_mut(0, len) }; + + let mut chunks = array.unpacked_chunks::()?; + chunks.decode_cast_into(uninit_slice, |v: F| v.as_()); + + if let Some(patches) = array.patches() { + apply_cast_patches_to_uninit_range::(&mut uninit_range, &patches, ctx)?; + } + + // SAFETY: A correct validity mask of `len` values was set via `append_mask`, and the same + // number of values was initialized via `decode_cast_into` (and overwritten by patches). + unsafe { + uninit_range.finish(); + } + Ok(()) +} + +/// Like [`apply_patches_to_uninit_range`], but the stored patch values have physical type `F` and +/// are cast to the wider output type `T` before being written. +fn apply_cast_patches_to_uninit_range( + dst: &mut UninitRange, + patches: &Patches, + ctx: &mut ExecutionCtx, +) -> VortexResult<()> +where + F: NativePType + AsPrimitive, + T: NativePType, +{ + assert_eq!(patches.array_len(), dst.len()); + + let indices = patches.indices().clone().execute::(ctx)?; + let values = patches.values().clone().execute::(ctx)?; + assert!(values.all_valid(ctx)?, "Patch values must be all valid"); + let values = values.as_slice::(); + + match_each_unsigned_integer_ptype!(indices.ptype(), |P| { + for (index, &value) in indices.as_slice::

().iter().zip_eq(values) { + dst.set_value( +

>::as_(*index) - patches.offset(), + value.as_(), + ); + } + }); + Ok(()) +} + pub fn apply_patches_to_uninit_range( dst: &mut UninitRange, patches: &Patches, @@ -587,6 +666,64 @@ mod tests { Ok(()) } + /// The cast-pushdown path must produce identical values to canonicalize-then-cast. + #[test] + fn test_unpack_and_cast_matches_canonical_cast() -> VortexResult<()> { + use vortex_array::builtins::ArrayBuiltins; + use vortex_array::dtype::DType; + use vortex_array::dtype::PType; + + let mut ctx = SESSION.create_execution_ctx(); + + // Three full chunks plus a trailer, with values large enough to force patches at width 8. + let values: Vec = (0..3500u16) + .map(|i| if i % 137 == 0 { 5000 + i } else { i % 200 }) + .collect(); + let array = PrimitiveArray::from_iter(values.clone()); + let bitpacked = encode(&array, 8); + assert!(bitpacked.patches().is_some()); + + let u32_dtype = DType::Primitive(PType::U32, Nullability::NonNullable); + + // Reference: canonicalize (unpack to u16) then cast to u32. + let reference = bitpacked + .clone() + .into_array() + .cast(u32_dtype.clone())? + .execute::(&mut ctx)?; + + // Pushdown: unpack u16 bit-packed directly into a u32 builder. + let mut builder = + PrimitiveBuilder::::with_capacity(Nullability::NonNullable, array.len()); + unpack_and_cast_into_builder::(bitpacked.as_view(), &mut builder, &mut ctx)?; + let pushdown = builder.finish_into_primitive(); + + assert_arrays_eq!(pushdown, reference); + + // Verify a sliced array (offset > 0, trailer present). Use patch-free values so the slice + // reduces eagerly to a concrete `BitPacked` with an offset. + let dense = PrimitiveArray::from_iter((0..3000u16).map(|i| i % 200)); + let dense_bp = encode(&dense, 8); + assert!(dense_bp.patches().is_none()); + let sliced = dense_bp + .into_array() + .slice(500..2600)? + .try_downcast::() + .map_err(|_| vortex_error::vortex_err!("slice did not reduce to BitPacked"))?; + assert!(sliced.offset() > 0); + + let reference_sliced = sliced + .clone() + .into_array() + .cast(u32_dtype)? + .execute::(&mut ctx)?; + let mut builder = + PrimitiveBuilder::::with_capacity(Nullability::NonNullable, sliced.len()); + unpack_and_cast_into_builder::(sliced.as_view(), &mut builder, &mut ctx)?; + assert_arrays_eq!(builder.finish_into_primitive(), reference_sliced); + Ok(()) + } + /// Test edge cases for unpacking. #[test] fn test_unpack_edge_cases() -> VortexResult<()> { diff --git a/encodings/fastlanes/src/bitpacking/array/unpack_iter.rs b/encodings/fastlanes/src/bitpacking/array/unpack_iter.rs index 2f7187d26f1..db09f0e3cc0 100644 --- a/encodings/fastlanes/src/bitpacking/array/unpack_iter.rs +++ b/encodings/fastlanes/src/bitpacking/array/unpack_iter.rs @@ -104,6 +104,50 @@ impl BitUnpackedChunks { ) } + /// Decode all chunks (initial, full, and trailer), mapping each value through `f` and writing + /// the result into a differently-typed `output`. + /// + /// Unlike [`decode_into`](Self::decode_into), full chunks cannot be unpacked directly into the + /// output because the output element type `U` differs from the packed type `T`. Each chunk is + /// unpacked into the small internal scratch buffer (which stays resident in cache) and then + /// mapped value-by-value into `output`. This avoids materializing a full-length `T`-typed + /// intermediate buffer, which is the win for cast-on-decompression (e.g. bit-packed `u16` cast + /// to `u32`). + pub fn decode_cast_into(&mut self, output: &mut [MaybeUninit], f: impl Fn(T) -> U) { + let mut local_idx = 0; + + if let Some(initial) = self.initial() { + for (dst, &src) in output[..initial.len()].iter_mut().zip(initial.iter()) { + dst.write(f(src)); + } + local_idx += initial.len(); + } + + // `initial` already handled the only chunk when `num_chunks == 1`; mirror the guard in + // `decode_full_chunks_into_at` so we don't decode chunk 0 twice. + if self.num_chunks != 1 { + let mut chunks = self.full_chunks(); + while let Some(chunk) = chunks.next() { + for (dst, &src) in output[local_idx..][..CHUNK_SIZE] + .iter_mut() + .zip(chunk.iter()) + { + dst.write(f(src)); + } + local_idx += CHUNK_SIZE; + } + } + + if let Some(trailer) = self.trailer() { + for (dst, &src) in output[local_idx..][..trailer.len()] + .iter_mut() + .zip(trailer.iter()) + { + dst.write(f(src)); + } + } + } + pub fn full_chunks(&mut self) -> BitUnpackIterator<'_, T> { let elems_per_chunk = self.elems_per_chunk(); let last_chunk_is_sliced = self.last_chunk_is_sliced() as usize; From 267524d47339264c34dfe85e3add18901da798bd Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 21 May 2026 13:07:49 +0000 Subject: [PATCH 2/8] Wire bit-packed cast pushdown into CastKernel for widening int casts Extend BitPacked's CastKernel so that widening integer casts (e.g. u16 -> u32) dispatch to the unpack-and-cast pushdown automatically, instead of falling back to canonicalize-then-cast. The cast is gated to strictly wider integer targets where every bit-packable value is representable (unsigned source, or signed-to-signed), so no per-value bounds check is needed. Update the cast_bitpacked benchmark to measure the real array.cast(u32).execute() path alongside an explicit canonicalize-then-cast baseline and the direct helper. Signed-off-by: "Joe Isaacs" --- encodings/fastlanes/benches/cast_bitpacked.rs | 39 ++++++++++--- .../fastlanes/src/bitpacking/compute/cast.rs | 55 +++++++++++++++++-- 2 files changed, 79 insertions(+), 15 deletions(-) diff --git a/encodings/fastlanes/benches/cast_bitpacked.rs b/encodings/fastlanes/benches/cast_bitpacked.rs index b2d5a89b3a2..2450b021b17 100644 --- a/encodings/fastlanes/benches/cast_bitpacked.rs +++ b/encodings/fastlanes/benches/cast_bitpacked.rs @@ -6,13 +6,14 @@ //! //! Three strategies are compared: //! -//! - `canonical_cast`: the current path. `cast(bit_packed)` has no pushdown, so it canonicalizes to -//! a full-length `u16` `PrimitiveArray` and then casts that to `u32`. Two full-length buffers are -//! allocated and the `u16` intermediate is written to RAM and read back. -//! - `chunked_canonical_cast`: the same, but over a `ChunkedArray` of bit-packed chunks, i.e. -//! `chunked(cast(bit_packed))`. This is the shape produced by a scan over a chunked column. -//! - `pushdown`: unpacks each 1024-element FastLanes chunk into a cache-resident scratch buffer and -//! cast-copies straight into the `u32` output. Only the `u32` output is allocated/touched in RAM. +//! - `cast_execute`: the real public path, `array.cast(u32).execute()`. With the bit-packed cast +//! pushdown wired into `BitPacked`'s `CastKernel`, this unpacks-and-casts in a single pass. +//! - `canonicalize_then_cast`: explicitly canonicalizes to a full-length `u16` `PrimitiveArray` and +//! then casts that to `u32`. This reproduces the behaviour before the pushdown existed (two +//! full-length buffers, the `u16` intermediate written to RAM and read back, plus the generic +//! primitive cast kernel's bounds-check scan), and serves as the in-run baseline. +//! - `pushdown_helper`: calls the `unpack_and_cast_into_builder` helper directly. This is the floor +//! for the technique, and `cast_execute` should track it once the kernel is wired in. #![expect(clippy::unwrap_used)] #![expect(clippy::cast_possible_truncation)] @@ -99,9 +100,10 @@ fn single(chunks: &[BitPackedArray]) -> ArrayRef { } } +/// The real public path: `array.cast(u32).execute()`. Hits the bit-packed cast pushdown kernel. #[cfg(not(codspeed))] #[divan::bench(args = ARGS)] -fn canonical_cast(bencher: Bencher, (chunk_len, chunk_count, frac): (usize, usize, f64)) { +fn cast_execute(bencher: Bencher, (chunk_len, chunk_count, frac): (usize, usize, f64)) { let chunks = make_chunks(chunk_len, chunk_count, frac); bencher .with_inputs(|| (single(&chunks), SESSION.create_execution_ctx())) @@ -115,9 +117,28 @@ fn canonical_cast(bencher: Bencher, (chunk_len, chunk_count, frac): (usize, usiz }); } +/// Baseline: canonicalize to a full-length `u16` array, then cast that primitive array to `u32`. +/// Reproduces the pre-pushdown behaviour. #[cfg(not(codspeed))] #[divan::bench(args = ARGS)] -fn pushdown(bencher: Bencher, (chunk_len, chunk_count, frac): (usize, usize, f64)) { +fn canonicalize_then_cast(bencher: Bencher, (chunk_len, chunk_count, frac): (usize, usize, f64)) { + let chunks = make_chunks(chunk_len, chunk_count, frac); + bencher + .with_inputs(|| (single(&chunks), SESSION.create_execution_ctx())) + .bench_refs(|(array, ctx)| { + let canonical = array.clone().execute::(ctx).unwrap(); + canonical + .into_array() + .cast(U32) + .unwrap() + .execute::(ctx) + .unwrap() + }); +} + +#[cfg(not(codspeed))] +#[divan::bench(args = ARGS)] +fn pushdown_helper(bencher: Bencher, (chunk_len, chunk_count, frac): (usize, usize, f64)) { let chunks = make_chunks(chunk_len, chunk_count, frac); let total = chunk_len * chunk_count; bencher diff --git a/encodings/fastlanes/src/bitpacking/compute/cast.rs b/encodings/fastlanes/src/bitpacking/compute/cast.rs index 3cb810e0442..987918b23d3 100644 --- a/encodings/fastlanes/src/bitpacking/compute/cast.rs +++ b/encodings/fastlanes/src/bitpacking/compute/cast.rs @@ -5,8 +5,11 @@ use vortex_array::ArrayRef; use vortex_array::ArrayView; use vortex_array::ExecutionCtx; use vortex_array::IntoArray; +use vortex_array::builders::PrimitiveBuilder; use vortex_array::builtins::ArrayBuiltins; use vortex_array::dtype::DType; +use vortex_array::dtype::PType; +use vortex_array::match_each_integer_ptype; use vortex_array::scalar_fn::fns::cast::CastKernel; use vortex_array::scalar_fn::fns::cast::CastReduce; use vortex_array::validity::Validity; @@ -14,6 +17,19 @@ use vortex_error::VortexResult; use crate::bitpacking::BitPacked; use crate::bitpacking::array::BitPackedArrayExt; +use crate::bitpacking::array::bitpack_decompress::unpack_and_cast_into_builder; + +/// Returns `true` if casting `src` to `tgt` is a widening integer cast for which every value a +/// bit-packed array can hold is guaranteed to be representable in `tgt` (so no per-value bounds +/// check is needed). This holds when `tgt` is strictly wider and either the source is unsigned +/// (always non-negative, fits in any wider type) or the target is also signed (sign-extension). +fn is_widening_int_cast(src: PType, tgt: PType) -> bool { + src.is_int() + && tgt.is_int() + && tgt.byte_width() > src.byte_width() + && (src.is_unsigned_int() || tgt.is_signed_int()) +} + fn build_with_validity( array: ArrayView<'_, BitPacked>, @@ -56,14 +72,41 @@ impl CastKernel for BitPacked { dtype: &DType, ctx: &mut ExecutionCtx, ) -> VortexResult> { - if !array.dtype().eq_ignore_nullability(dtype) { + // Nullability-only change: keep the values bit-packed, just adjust validity. + if array.dtype().eq_ignore_nullability(dtype) { + let new_validity = + array + .validity()? + .cast_nullability(dtype.nullability(), array.len(), ctx)?; + return build_with_validity(array, dtype, new_validity).map(Some); + } + + // Widening integer cast: unpack each FastLanes chunk into a cache-resident scratch buffer + // and cast-copy straight into the wide output, avoiding a full-length intermediate buffer + // and the generic cast kernel's bounds-check scan (unnecessary when widening). + let DType::Primitive(tgt, tgt_nullability) = dtype else { + return Ok(None); + }; + let (tgt, tgt_nullability) = (*tgt, *tgt_nullability); + let src = array.dtype().as_ptype(); + if !is_widening_int_cast(src, tgt) { return Ok(None); } - let new_validity = - array - .validity()? - .cast_nullability(dtype.nullability(), array.len(), ctx)?; - build_with_validity(array, dtype, new_validity).map(Some) + + // Surface the standard error if a nullable source with nulls is cast to a non-nullable + // type; on success the per-value validity is handled inside the unpack below. + array + .validity()? + .cast_nullability(tgt_nullability, array.len(), ctx)?; + + let result = match_each_integer_ptype!(tgt, |T| { + let mut builder = PrimitiveBuilder::::with_capacity(tgt_nullability, array.len()); + match_each_integer_ptype!(src, |F| { + unpack_and_cast_into_builder::(array, &mut builder, ctx)?; + }); + builder.finish_into_primitive().into_array() + }); + Ok(Some(result)) } } From 2010fd36c09396a88567ffbc87720355a3df37af Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 21 May 2026 13:12:45 +0000 Subject: [PATCH 3/8] Fix clippy lints and refresh public-api.lock for cast pushdown Signed-off-by: "Joe Isaacs" --- encodings/fastlanes/benches/cast_bitpacked.rs | 1 - encodings/fastlanes/public-api.lock | 4 ++++ .../fastlanes/src/bitpacking/array/bitpack_decompress.rs | 6 ++---- encodings/fastlanes/src/bitpacking/compute/cast.rs | 1 - 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/encodings/fastlanes/benches/cast_bitpacked.rs b/encodings/fastlanes/benches/cast_bitpacked.rs index 2450b021b17..d2e1b08ac3f 100644 --- a/encodings/fastlanes/benches/cast_bitpacked.rs +++ b/encodings/fastlanes/benches/cast_bitpacked.rs @@ -16,7 +16,6 @@ //! for the technique, and `cast_execute` should track it once the kernel is wired in. #![expect(clippy::unwrap_used)] -#![expect(clippy::cast_possible_truncation)] use std::sync::LazyLock; diff --git a/encodings/fastlanes/public-api.lock b/encodings/fastlanes/public-api.lock index 4f0ce3df18c..1ede93c12d7 100644 --- a/encodings/fastlanes/public-api.lock +++ b/encodings/fastlanes/public-api.lock @@ -40,6 +40,8 @@ pub fn vortex_fastlanes::bitpack_decompress::apply_patches_to_uninit_range_fn usize +pub fn vortex_fastlanes::bitpack_decompress::unpack_and_cast_into_builder(vortex_array::array::view::ArrayView<'_, vortex_fastlanes::BitPacked>, &mut vortex_array::builders::primitive::PrimitiveBuilder, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<()> where F: vortex_fastlanes::unpack_iter::BitPacked + num_traits::cast::AsPrimitive, T: vortex_array::dtype::ptype::NativePType + pub fn vortex_fastlanes::bitpack_decompress::unpack_array(vortex_array::array::view::ArrayView<'_, vortex_fastlanes::BitPacked>, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult pub fn vortex_fastlanes::bitpack_decompress::unpack_primitive_array(vortex_array::array::view::ArrayView<'_, vortex_fastlanes::BitPacked>, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult @@ -84,6 +86,8 @@ pub fn vortex_fastlanes::unpack_iter::UnpackedChunks::try_new_with_strateg impl vortex_fastlanes::unpack_iter::UnpackedChunks +pub fn vortex_fastlanes::unpack_iter::UnpackedChunks::decode_cast_into(&mut self, &mut [core::mem::maybe_uninit::MaybeUninit], impl core::ops::function::Fn(T) -> U) + pub fn vortex_fastlanes::unpack_iter::UnpackedChunks::full_chunks(&mut self) -> vortex_fastlanes::unpack_iter::BitUnpackIterator<'_, T> pub fn vortex_fastlanes::unpack_iter::UnpackedChunks::try_new(&vortex_fastlanes::BitPackedData, usize) -> vortex_error::VortexResult diff --git a/encodings/fastlanes/src/bitpacking/array/bitpack_decompress.rs b/encodings/fastlanes/src/bitpacking/array/bitpack_decompress.rs index ce1b887b8e8..65d2d147927 100644 --- a/encodings/fastlanes/src/bitpacking/array/bitpack_decompress.rs +++ b/encodings/fastlanes/src/bitpacking/array/bitpack_decompress.rs @@ -676,10 +676,8 @@ mod tests { let mut ctx = SESSION.create_execution_ctx(); // Three full chunks plus a trailer, with values large enough to force patches at width 8. - let values: Vec = (0..3500u16) - .map(|i| if i % 137 == 0 { 5000 + i } else { i % 200 }) - .collect(); - let array = PrimitiveArray::from_iter(values.clone()); + let values = (0..3500u16).map(|i| if i % 137 == 0 { 5000 + i } else { i % 200 }); + let array = PrimitiveArray::from_iter(values); let bitpacked = encode(&array, 8); assert!(bitpacked.patches().is_some()); diff --git a/encodings/fastlanes/src/bitpacking/compute/cast.rs b/encodings/fastlanes/src/bitpacking/compute/cast.rs index 987918b23d3..9a355f822fa 100644 --- a/encodings/fastlanes/src/bitpacking/compute/cast.rs +++ b/encodings/fastlanes/src/bitpacking/compute/cast.rs @@ -30,7 +30,6 @@ fn is_widening_int_cast(src: PType, tgt: PType) -> bool { && (src.is_unsigned_int() || tgt.is_signed_int()) } - fn build_with_validity( array: ArrayView<'_, BitPacked>, dtype: &DType, From b0d1f54ff48d7d448b35df0447c9f7e44b040276 Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Thu, 21 May 2026 15:39:42 +0100 Subject: [PATCH 4/8] u Signed-off-by: Joe Isaacs --- .../fastlanes/src/bitpacking/array/bitpack_decompress.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/encodings/fastlanes/src/bitpacking/array/bitpack_decompress.rs b/encodings/fastlanes/src/bitpacking/array/bitpack_decompress.rs index 65d2d147927..95da6393897 100644 --- a/encodings/fastlanes/src/bitpacking/array/bitpack_decompress.rs +++ b/encodings/fastlanes/src/bitpacking/array/bitpack_decompress.rs @@ -687,6 +687,8 @@ mod tests { let reference = bitpacked .clone() .into_array() + .execute::(&mut ctx)? + .into_array() .cast(u32_dtype.clone())? .execute::(&mut ctx)?; @@ -713,6 +715,8 @@ mod tests { let reference_sliced = sliced .clone() .into_array() + .execute::(&mut ctx)? + .into_array() .cast(u32_dtype)? .execute::(&mut ctx)?; let mut builder = From 921b04b756e9ab9962b45cb52c44403b21fb91e0 Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Thu, 21 May 2026 16:35:46 +0100 Subject: [PATCH 5/8] u Signed-off-by: Joe Isaacs --- .../bitpacking/array/bitpack_decompress.rs | 6 +- .../src/bitpacking/array/unpack_iter.rs | 166 ++++++++---------- .../fastlanes/src/bitpacking/compute/cast.rs | 64 +++++++ 3 files changed, 137 insertions(+), 99 deletions(-) diff --git a/encodings/fastlanes/src/bitpacking/array/bitpack_decompress.rs b/encodings/fastlanes/src/bitpacking/array/bitpack_decompress.rs index 95da6393897..69de49a323d 100644 --- a/encodings/fastlanes/src/bitpacking/array/bitpack_decompress.rs +++ b/encodings/fastlanes/src/bitpacking/array/bitpack_decompress.rs @@ -106,16 +106,16 @@ where let len = array.len(); let mut uninit_range = builder.uninit_range(len); - // SAFETY: We initialize all `len` values below via `decode_cast_into` and the patch loop. + // SAFETY: We initialize all `len` values below via `decode_map_into` and the patch loop. unsafe { uninit_range.append_mask(array.validity()?.execute_mask(len, ctx)?); } - // SAFETY: `decode_cast_into` writes a value to every slot in this range. + // SAFETY: `decode_map_into` writes a value to every slot in this range. let uninit_slice = unsafe { uninit_range.slice_uninit_mut(0, len) }; let mut chunks = array.unpacked_chunks::()?; - chunks.decode_cast_into(uninit_slice, |v: F| v.as_()); + chunks.decode_map_into(uninit_slice, |v: F| v.as_()); if let Some(patches) = array.patches() { apply_cast_patches_to_uninit_range::(&mut uninit_range, &patches, ctx)?; diff --git a/encodings/fastlanes/src/bitpacking/array/unpack_iter.rs b/encodings/fastlanes/src/bitpacking/array/unpack_iter.rs index db09f0e3cc0..4a863653159 100644 --- a/encodings/fastlanes/src/bitpacking/array/unpack_iter.rs +++ b/encodings/fastlanes/src/bitpacking/array/unpack_iter.rs @@ -104,50 +104,6 @@ impl BitUnpackedChunks { ) } - /// Decode all chunks (initial, full, and trailer), mapping each value through `f` and writing - /// the result into a differently-typed `output`. - /// - /// Unlike [`decode_into`](Self::decode_into), full chunks cannot be unpacked directly into the - /// output because the output element type `U` differs from the packed type `T`. Each chunk is - /// unpacked into the small internal scratch buffer (which stays resident in cache) and then - /// mapped value-by-value into `output`. This avoids materializing a full-length `T`-typed - /// intermediate buffer, which is the win for cast-on-decompression (e.g. bit-packed `u16` cast - /// to `u32`). - pub fn decode_cast_into(&mut self, output: &mut [MaybeUninit], f: impl Fn(T) -> U) { - let mut local_idx = 0; - - if let Some(initial) = self.initial() { - for (dst, &src) in output[..initial.len()].iter_mut().zip(initial.iter()) { - dst.write(f(src)); - } - local_idx += initial.len(); - } - - // `initial` already handled the only chunk when `num_chunks == 1`; mirror the guard in - // `decode_full_chunks_into_at` so we don't decode chunk 0 twice. - if self.num_chunks != 1 { - let mut chunks = self.full_chunks(); - while let Some(chunk) = chunks.next() { - for (dst, &src) in output[local_idx..][..CHUNK_SIZE] - .iter_mut() - .zip(chunk.iter()) - { - dst.write(f(src)); - } - local_idx += CHUNK_SIZE; - } - } - - if let Some(trailer) = self.trailer() { - for (dst, &src) in output[local_idx..][..trailer.len()] - .iter_mut() - .zip(trailer.iter()) - { - dst.write(f(src)); - } - } - } - pub fn full_chunks(&mut self) -> BitUnpackIterator<'_, T> { let elems_per_chunk = self.elems_per_chunk(); let last_chunk_is_sliced = self.last_chunk_is_sliced() as usize; @@ -161,6 +117,15 @@ impl BitUnpackedChunks { first_chunk_is_sliced, ) } + + /// Decode all chunks (initial, full, and trailer), mapping each value through `f` and writing + /// the result into a differently-typed `output`. + /// + /// Kept as a cast-oriented alias for callers that want the old name. Internal code can call + /// `decode_map_into` directly. + pub fn decode_cast_into(&mut self, output: &mut [MaybeUninit], f: impl Fn(T) -> U) { + self.decode_map_into(output, f); + } } impl> UnpackedChunks { @@ -225,70 +190,79 @@ impl> UnpackedChunks { }) } - /// Decode all chunks (initial, full, and trailer) into the output range. - /// This consolidates the logic for handling all three chunk types in one place. - pub fn decode_into(&mut self, output: &mut [MaybeUninit]) { + /// Decode all chunks (initial, full, and trailer), calling `write_chunk` for each concrete + /// unpacked chunk and its corresponding output range. + pub(crate) fn decode_chunks_into( + &mut self, + output: &mut [MaybeUninit], + mut write_chunk: impl FnMut(&[T], &mut [MaybeUninit]), + ) { + debug_assert_eq!(output.len(), self.len); let mut local_idx = 0; - // Handle initial partial chunk if present if let Some(initial) = self.initial() { - local_idx = initial.len(); - - // TODO(connor): use `maybe_uninit_write_slice` feature when it gets stabilized. - // https://github.com/rust-lang/rust/issues/79995 - // SAFETY: &[T] and &[MaybeUninit] have the same layout. - let init_initial: &[MaybeUninit] = unsafe { mem::transmute(initial) }; - output[..local_idx].copy_from_slice(init_initial); + let chunk_len = initial.len(); + write_chunk(initial, &mut output[..chunk_len]); + local_idx += chunk_len; } - // Handle full chunks - local_idx = self.decode_full_chunks_into_at(output, local_idx); + if self.num_chunks != 1 { + let first_chunk_is_sliced = self.first_chunk_is_sliced(); + let last_chunk_is_sliced = self.last_chunk_is_sliced(); + let full_chunks_range = + (first_chunk_is_sliced as usize)..(self.num_chunks - last_chunk_is_sliced as usize); + + let packed_slice: &[T::Physical] = buffer_as_slice(&self.packed); + let elems_per_chunk = self.elems_per_chunk(); + for i in full_chunks_range { + let chunk = &packed_slice[i * elems_per_chunk..][..elems_per_chunk]; + + unsafe { + let dst: &mut [MaybeUninit] = &mut self.buffer; + // SAFETY: &[T] and &[MaybeUninit] have the same layout. + let dst: &mut [T::Physical] = mem::transmute(dst); + self.strategy.unpack_chunk(self.bit_width, chunk, dst); + } - // Handle trailing partial chunk if present - if let Some(trailer) = self.trailer() { - // TODO(connor): use `maybe_uninit_write_slice` feature when it gets stabilized. - // https://github.com/rust-lang/rust/issues/79995 - // SAFETY: &[T] and &[MaybeUninit] have the same layout. - let init_trailer: &[MaybeUninit] = unsafe { mem::transmute(trailer) }; - output[local_idx..][..init_trailer.len()].copy_from_slice(init_trailer); + // SAFETY: `unpack_chunk` initialized the whole scratch chunk above. + let unpacked: &[T] = unsafe { mem::transmute(&self.buffer[..]) }; + write_chunk(unpacked, &mut output[local_idx..local_idx + CHUNK_SIZE]); + local_idx += CHUNK_SIZE; + } } - } - /// Unpack full chunks into output range starting at the given index. - /// Returns the next local index to write to. - fn decode_full_chunks_into_at( - &mut self, - output: &mut [MaybeUninit], - start_idx: usize, - ) -> usize { - // If there's only one chunk it has been handled already by `initial` method - if self.num_chunks == 1 { - // Return the start_idx since initial already wrote everything. - return start_idx; + if let Some(trailer) = self.trailer() { + let chunk_len = trailer.len(); + write_chunk(trailer, &mut output[local_idx..local_idx + chunk_len]); + local_idx += chunk_len; } - let first_chunk_is_sliced = self.first_chunk_is_sliced(); - - let last_chunk_is_sliced = self.last_chunk_is_sliced(); - let full_chunks_range = - (first_chunk_is_sliced as usize)..(self.num_chunks - last_chunk_is_sliced as usize); - - let mut local_idx = start_idx; - - let packed_slice: &[T::Physical] = buffer_as_slice(&self.packed); - let elems_per_chunk = self.elems_per_chunk(); - for i in full_chunks_range { - let chunk = &packed_slice[i * elems_per_chunk..][..elems_per_chunk]; + debug_assert_eq!(local_idx, self.len); + } - unsafe { - let uninit_dst = &mut output[local_idx..local_idx + CHUNK_SIZE]; - // SAFETY: &[T] and &[MaybeUninit] have the same layout - let dst: &mut [T::Physical] = mem::transmute(uninit_dst); - self.strategy.unpack_chunk(self.bit_width, chunk, dst); + /// Decode all chunks (initial, full, and trailer), mapping each unpacked value through `f`. + pub(crate) fn decode_map_into( + &mut self, + output: &mut [MaybeUninit], + mut f: impl FnMut(T) -> U, + ) { + self.decode_chunks_into(output, |chunk, dst| { + for (dst, &src) in dst.iter_mut().zip(chunk.iter()) { + dst.write(f(src)); } - local_idx += CHUNK_SIZE; - } - local_idx + }); + } + + /// Decode all chunks (initial, full, and trailer) into the output range. + /// This is the identity mapping of [`decode_map_into`](Self::decode_map_into). + pub fn decode_into(&mut self, output: &mut [MaybeUninit]) { + self.decode_chunks_into(output, |chunk, dst| { + // TODO(connor): use `maybe_uninit_write_slice` feature when it gets stabilized. + // https://github.com/rust-lang/rust/issues/79995 + // SAFETY: &[T] and &[MaybeUninit] have the same layout. + let initialized: &[MaybeUninit] = unsafe { mem::transmute(chunk) }; + dst.copy_from_slice(initialized); + }); } /// Access last chunk of the array if the last chunk has fewer than 1024 due to slicing diff --git a/encodings/fastlanes/src/bitpacking/compute/cast.rs b/encodings/fastlanes/src/bitpacking/compute/cast.rs index 9a355f822fa..0892227919f 100644 --- a/encodings/fastlanes/src/bitpacking/compute/cast.rs +++ b/encodings/fastlanes/src/bitpacking/compute/cast.rs @@ -121,10 +121,15 @@ mod tests { use vortex_array::builtins::ArrayBuiltins; use vortex_array::compute::conformance::cast::test_cast_conformance; use vortex_array::dtype::DType; + use vortex_array::dtype::NativePType; use vortex_array::dtype::Nullability; use vortex_array::dtype::PType; + use vortex_array::match_each_integer_ptype; + use vortex_array::scalar_fn::fns::cast::CastKernel; use vortex_buffer::buffer; + use vortex_error::VortexResult; + use crate::BitPacked; use crate::BitPackedArray; use crate::BitPackedData; @@ -166,6 +171,65 @@ mod tests { ); } + #[test] + fn test_cast_bitpacked_widening_integer_matrix() -> VortexResult<()> { + fn values(len: usize) -> PrimitiveArray { + PrimitiveArray::from_iter((0..len).map(|i| { + let value = if i % 17 == 0 { 31 } else { i % 8 }; + ::from_usize(value) + .expect("test values fit every integer ptype") + })) + } + + fn supported(src: PType, tgt: PType) -> bool { + src.is_int() + && tgt.is_int() + && tgt.byte_width() > src.byte_width() + && (src.is_unsigned_int() || tgt.is_signed_int()) + } + + let ptypes = [ + PType::I8, + PType::I16, + PType::I32, + PType::I64, + PType::U8, + PType::U16, + PType::U32, + PType::U64, + ]; + let lengths = [0, 1, 7, 1023, 1024, 1025, 2051]; + + for src in ptypes { + for tgt in ptypes { + if !supported(src, tgt) { + continue; + } + + for len in lengths { + let source = match_each_integer_ptype!(src, |S| { values::(len) }); + let source_ref = source.clone().into_array(); + let packed = bp(&source_ref, 3); + let target = DType::Primitive(tgt, Nullability::NonNullable); + + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + let casted = + ::cast(packed.as_view(), &target, &mut ctx)? + .expect( + "supported widening integer cast should hit BitPacked CastKernel", + ); + let reference = source_ref + .cast(target)? + .execute::(&mut ctx)?; + + assert_arrays_eq!(casted, reference); + } + } + } + + Ok(()) + } + #[rstest] #[case(bp(&buffer![0u8, 10, 20, 30, 40, 50, 60, 63].into_array(), 6))] #[case(bp(&buffer![0u16, 100, 200, 300, 400, 500].into_array(), 9))] From 69d72709e9aade855f6e171b67a5fb1cfef10f10 Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Thu, 21 May 2026 17:12:13 +0100 Subject: [PATCH 6/8] u Signed-off-by: Joe Isaacs --- .../src/bitpacking/array/unpack_iter.rs | 96 ++++++++++++++----- 1 file changed, 72 insertions(+), 24 deletions(-) diff --git a/encodings/fastlanes/src/bitpacking/array/unpack_iter.rs b/encodings/fastlanes/src/bitpacking/array/unpack_iter.rs index 4a863653159..08e44051a5c 100644 --- a/encodings/fastlanes/src/bitpacking/array/unpack_iter.rs +++ b/encodings/fastlanes/src/bitpacking/array/unpack_iter.rs @@ -93,6 +93,60 @@ pub struct UnpackedChunks> { pub type BitUnpackedChunks = UnpackedChunks; +enum DecodeChunk<'a, T: PhysicalPType, S: UnpackStrategy> { + Unpacked(&'a [T]), + PackedFull { + strategy: &'a S, + bit_width: usize, + chunk: &'a [T::Physical], + scratch: &'a mut [MaybeUninit; CHUNK_SIZE], + }, +} + +impl<'a, T: PhysicalPType, S: UnpackStrategy> DecodeChunk<'a, T, S> { + fn unpacked(self) -> &'a [T] { + match self { + Self::Unpacked(chunk) => chunk, + Self::PackedFull { + strategy, + bit_width, + chunk, + scratch, + } => unsafe { + // SAFETY: `scratch` is exactly one FastLanes chunk and `chunk` contains the + // corresponding packed words. + let dst: &mut [T::Physical] = mem::transmute(&mut scratch[..]); + strategy.unpack_chunk(bit_width, chunk, dst); + // SAFETY: `unpack_chunk` initialized the whole scratch chunk above. + mem::transmute::<&[MaybeUninit], &[T]>(&scratch[..]) + }, + } + } + + fn write_identity_to(self, dst: &mut [MaybeUninit]) { + match self { + Self::Unpacked(chunk) => { + // TODO(connor): use `maybe_uninit_write_slice` feature when it gets stabilized. + // https://github.com/rust-lang/rust/issues/79995 + // SAFETY: &[T] and &[MaybeUninit] have the same layout. + let initialized: &[MaybeUninit] = unsafe { mem::transmute(chunk) }; + dst.copy_from_slice(initialized); + } + Self::PackedFull { + strategy, + bit_width, + chunk, + .. + } => unsafe { + // SAFETY: `dst` is exactly one FastLanes chunk and `chunk` contains the + // corresponding packed words. + let dst: &mut [T::Physical] = mem::transmute(dst); + strategy.unpack_chunk(bit_width, chunk, dst); + }, + } + } +} + impl BitUnpackedChunks { pub fn try_new(array: &BitPackedData, len: usize) -> VortexResult { Self::try_new_with_strategy( @@ -190,19 +244,17 @@ impl> UnpackedChunks { }) } - /// Decode all chunks (initial, full, and trailer), calling `write_chunk` for each concrete - /// unpacked chunk and its corresponding output range. - pub(crate) fn decode_chunks_into( + fn decode_chunks_into( &mut self, output: &mut [MaybeUninit], - mut write_chunk: impl FnMut(&[T], &mut [MaybeUninit]), + mut write_chunk: impl FnMut(DecodeChunk<'_, T, S>, &mut [MaybeUninit]), ) { debug_assert_eq!(output.len(), self.len); let mut local_idx = 0; if let Some(initial) = self.initial() { let chunk_len = initial.len(); - write_chunk(initial, &mut output[..chunk_len]); + write_chunk(DecodeChunk::Unpacked(initial), &mut output[..chunk_len]); local_idx += chunk_len; } @@ -217,23 +269,25 @@ impl> UnpackedChunks { for i in full_chunks_range { let chunk = &packed_slice[i * elems_per_chunk..][..elems_per_chunk]; - unsafe { - let dst: &mut [MaybeUninit] = &mut self.buffer; - // SAFETY: &[T] and &[MaybeUninit] have the same layout. - let dst: &mut [T::Physical] = mem::transmute(dst); - self.strategy.unpack_chunk(self.bit_width, chunk, dst); - } - - // SAFETY: `unpack_chunk` initialized the whole scratch chunk above. - let unpacked: &[T] = unsafe { mem::transmute(&self.buffer[..]) }; - write_chunk(unpacked, &mut output[local_idx..local_idx + CHUNK_SIZE]); + write_chunk( + DecodeChunk::PackedFull { + strategy: &self.strategy, + bit_width: self.bit_width, + chunk, + scratch: &mut self.buffer, + }, + &mut output[local_idx..local_idx + CHUNK_SIZE], + ); local_idx += CHUNK_SIZE; } } if let Some(trailer) = self.trailer() { let chunk_len = trailer.len(); - write_chunk(trailer, &mut output[local_idx..local_idx + chunk_len]); + write_chunk( + DecodeChunk::Unpacked(trailer), + &mut output[local_idx..local_idx + chunk_len], + ); local_idx += chunk_len; } @@ -247,6 +301,7 @@ impl> UnpackedChunks { mut f: impl FnMut(T) -> U, ) { self.decode_chunks_into(output, |chunk, dst| { + let chunk = chunk.unpacked(); for (dst, &src) in dst.iter_mut().zip(chunk.iter()) { dst.write(f(src)); } @@ -254,15 +309,8 @@ impl> UnpackedChunks { } /// Decode all chunks (initial, full, and trailer) into the output range. - /// This is the identity mapping of [`decode_map_into`](Self::decode_map_into). pub fn decode_into(&mut self, output: &mut [MaybeUninit]) { - self.decode_chunks_into(output, |chunk, dst| { - // TODO(connor): use `maybe_uninit_write_slice` feature when it gets stabilized. - // https://github.com/rust-lang/rust/issues/79995 - // SAFETY: &[T] and &[MaybeUninit] have the same layout. - let initialized: &[MaybeUninit] = unsafe { mem::transmute(chunk) }; - dst.copy_from_slice(initialized); - }); + self.decode_chunks_into(output, |chunk, dst| chunk.write_identity_to(dst)); } /// Access last chunk of the array if the last chunk has fewer than 1024 due to slicing From dde594995459575823c2729f080827fbc391e237 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 21 May 2026 19:06:25 +0000 Subject: [PATCH 7/8] Deduplicate patch application in bit-packed cast pushdown Generalize apply_patches_to_uninit_range_fn to a cross-type Fn(S) -> T so the cast pushdown reuses it instead of a near-identical copy, and drop the redundant identity wrapper. Behaviour and performance are unchanged. Signed-off-by: "Joe Isaacs" --- encodings/fastlanes/public-api.lock | 4 +- .../bitpacking/array/bitpack_decompress.rs | 49 +++---------------- .../fastlanes/src/for/array/for_decompress.rs | 2 +- 3 files changed, 9 insertions(+), 46 deletions(-) diff --git a/encodings/fastlanes/public-api.lock b/encodings/fastlanes/public-api.lock index 1ede93c12d7..3c3e5cab146 100644 --- a/encodings/fastlanes/public-api.lock +++ b/encodings/fastlanes/public-api.lock @@ -34,9 +34,7 @@ pub fn vortex_fastlanes::bitpack_compress::gather_patches(&vortex_array::arrays: pub mod vortex_fastlanes::bitpack_decompress -pub fn vortex_fastlanes::bitpack_decompress::apply_patches_to_uninit_range(&mut vortex_array::builders::primitive::UninitRange<'_, T>, &vortex_array::patches::Patches, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<()> - -pub fn vortex_fastlanes::bitpack_decompress::apply_patches_to_uninit_range_fn T>(&mut vortex_array::builders::primitive::UninitRange<'_, T>, &vortex_array::patches::Patches, &mut vortex_array::executor::ExecutionCtx, F) -> vortex_error::VortexResult<()> +pub fn vortex_fastlanes::bitpack_decompress::apply_patches_to_uninit_range_fn T>(&mut vortex_array::builders::primitive::UninitRange<'_, T>, &vortex_array::patches::Patches, &mut vortex_array::executor::ExecutionCtx, Fun) -> vortex_error::VortexResult<()> pub fn vortex_fastlanes::bitpack_decompress::count_exceptions(u8, &[usize]) -> usize diff --git a/encodings/fastlanes/src/bitpacking/array/bitpack_decompress.rs b/encodings/fastlanes/src/bitpacking/array/bitpack_decompress.rs index 69de49a323d..47bcd1120fd 100644 --- a/encodings/fastlanes/src/bitpacking/array/bitpack_decompress.rs +++ b/encodings/fastlanes/src/bitpacking/array/bitpack_decompress.rs @@ -68,7 +68,7 @@ pub(crate) fn unpack_into_primitive_builder( bit_packed_iter.decode_into(uninit_slice); if let Some(patches) = array.patches() { - apply_patches_to_uninit_range(&mut uninit_range, &patches, ctx)?; + apply_patches_to_uninit_range_fn(&mut uninit_range, &patches, ctx, |v: T| v)?; }; // SAFETY: We have set a correct validity mask via `append_mask` with `array.len()` values and @@ -118,7 +118,7 @@ where chunks.decode_map_into(uninit_slice, |v: F| v.as_()); if let Some(patches) = array.patches() { - apply_cast_patches_to_uninit_range::(&mut uninit_range, &patches, ctx)?; + apply_patches_to_uninit_range_fn(&mut uninit_range, &patches, ctx, |v: F| v.as_())?; } // SAFETY: A correct validity mask of `len` values was set via `append_mask`, and the same @@ -129,55 +129,20 @@ where Ok(()) } -/// Like [`apply_patches_to_uninit_range`], but the stored patch values have physical type `F` and -/// are cast to the wider output type `T` before being written. -fn apply_cast_patches_to_uninit_range( +/// Applies the patches to the uninitialized range, casting each stored patch value of physical +/// type `S` to the output type `T` via the identity-or-widening map `f`. +pub fn apply_patches_to_uninit_range_fn T>( dst: &mut UninitRange, patches: &Patches, ctx: &mut ExecutionCtx, -) -> VortexResult<()> -where - F: NativePType + AsPrimitive, - T: NativePType, -{ - assert_eq!(patches.array_len(), dst.len()); - - let indices = patches.indices().clone().execute::(ctx)?; - let values = patches.values().clone().execute::(ctx)?; - assert!(values.all_valid(ctx)?, "Patch values must be all valid"); - let values = values.as_slice::(); - - match_each_unsigned_integer_ptype!(indices.ptype(), |P| { - for (index, &value) in indices.as_slice::

().iter().zip_eq(values) { - dst.set_value( -

>::as_(*index) - patches.offset(), - value.as_(), - ); - } - }); - Ok(()) -} - -pub fn apply_patches_to_uninit_range( - dst: &mut UninitRange, - patches: &Patches, - ctx: &mut ExecutionCtx, -) -> VortexResult<()> { - apply_patches_to_uninit_range_fn(dst, patches, ctx, |x| x) -} - -pub fn apply_patches_to_uninit_range_fn T>( - dst: &mut UninitRange, - patches: &Patches, - ctx: &mut ExecutionCtx, - f: F, + f: Fun, ) -> VortexResult<()> { assert_eq!(patches.array_len(), dst.len()); let indices = patches.indices().clone().execute::(ctx)?; let values = patches.values().clone().execute::(ctx)?; assert!(values.all_valid(ctx)?, "Patch values must be all valid"); - let values = values.as_slice::(); + let values = values.as_slice::(); match_each_unsigned_integer_ptype!(indices.ptype(), |P| { for (index, &value) in indices.as_slice::

().iter().zip_eq(values) { diff --git a/encodings/fastlanes/src/for/array/for_decompress.rs b/encodings/fastlanes/src/for/array/for_decompress.rs index a26d6e9053e..6b43ac85c24 100644 --- a/encodings/fastlanes/src/for/array/for_decompress.rs +++ b/encodings/fastlanes/src/for/array/for_decompress.rs @@ -123,7 +123,7 @@ pub(crate) fn fused_decompress< &mut uninit_range, &patches, ctx, - |v| v.wrapping_add(&ref_), + |v: T| v.wrapping_add(&ref_), )?; }; From 6603c3f25992ad5e8f335d45f154e146e4dfb10d Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 21 May 2026 20:33:36 +0000 Subject: [PATCH 8/8] Test bit-packed widening cast through the real engine path Replace the direct-kernel and direct-helper cast tests with a single end-to-end test that drives array.cast(target).execute(), proving the public Vortex path dispatches to BitPacked's widening pushdown across all supported integer pairs, chunk-boundary lengths, and a sliced case. Signed-off-by: "Joe Isaacs" --- .../bitpacking/array/bitpack_decompress.rs | 60 ------------------- .../fastlanes/src/bitpacking/compute/cast.rs | 44 ++++++++++---- 2 files changed, 32 insertions(+), 72 deletions(-) diff --git a/encodings/fastlanes/src/bitpacking/array/bitpack_decompress.rs b/encodings/fastlanes/src/bitpacking/array/bitpack_decompress.rs index 47bcd1120fd..052cda035d1 100644 --- a/encodings/fastlanes/src/bitpacking/array/bitpack_decompress.rs +++ b/encodings/fastlanes/src/bitpacking/array/bitpack_decompress.rs @@ -631,66 +631,6 @@ mod tests { Ok(()) } - /// The cast-pushdown path must produce identical values to canonicalize-then-cast. - #[test] - fn test_unpack_and_cast_matches_canonical_cast() -> VortexResult<()> { - use vortex_array::builtins::ArrayBuiltins; - use vortex_array::dtype::DType; - use vortex_array::dtype::PType; - - let mut ctx = SESSION.create_execution_ctx(); - - // Three full chunks plus a trailer, with values large enough to force patches at width 8. - let values = (0..3500u16).map(|i| if i % 137 == 0 { 5000 + i } else { i % 200 }); - let array = PrimitiveArray::from_iter(values); - let bitpacked = encode(&array, 8); - assert!(bitpacked.patches().is_some()); - - let u32_dtype = DType::Primitive(PType::U32, Nullability::NonNullable); - - // Reference: canonicalize (unpack to u16) then cast to u32. - let reference = bitpacked - .clone() - .into_array() - .execute::(&mut ctx)? - .into_array() - .cast(u32_dtype.clone())? - .execute::(&mut ctx)?; - - // Pushdown: unpack u16 bit-packed directly into a u32 builder. - let mut builder = - PrimitiveBuilder::::with_capacity(Nullability::NonNullable, array.len()); - unpack_and_cast_into_builder::(bitpacked.as_view(), &mut builder, &mut ctx)?; - let pushdown = builder.finish_into_primitive(); - - assert_arrays_eq!(pushdown, reference); - - // Verify a sliced array (offset > 0, trailer present). Use patch-free values so the slice - // reduces eagerly to a concrete `BitPacked` with an offset. - let dense = PrimitiveArray::from_iter((0..3000u16).map(|i| i % 200)); - let dense_bp = encode(&dense, 8); - assert!(dense_bp.patches().is_none()); - let sliced = dense_bp - .into_array() - .slice(500..2600)? - .try_downcast::() - .map_err(|_| vortex_error::vortex_err!("slice did not reduce to BitPacked"))?; - assert!(sliced.offset() > 0); - - let reference_sliced = sliced - .clone() - .into_array() - .execute::(&mut ctx)? - .into_array() - .cast(u32_dtype)? - .execute::(&mut ctx)?; - let mut builder = - PrimitiveBuilder::::with_capacity(Nullability::NonNullable, sliced.len()); - unpack_and_cast_into_builder::(sliced.as_view(), &mut builder, &mut ctx)?; - assert_arrays_eq!(builder.finish_into_primitive(), reference_sliced); - Ok(()) - } - /// Test edge cases for unpacking. #[test] fn test_unpack_edge_cases() -> VortexResult<()> { diff --git a/encodings/fastlanes/src/bitpacking/compute/cast.rs b/encodings/fastlanes/src/bitpacking/compute/cast.rs index 0892227919f..1b5346c48f7 100644 --- a/encodings/fastlanes/src/bitpacking/compute/cast.rs +++ b/encodings/fastlanes/src/bitpacking/compute/cast.rs @@ -125,11 +125,9 @@ mod tests { use vortex_array::dtype::Nullability; use vortex_array::dtype::PType; use vortex_array::match_each_integer_ptype; - use vortex_array::scalar_fn::fns::cast::CastKernel; use vortex_buffer::buffer; use vortex_error::VortexResult; - use crate::BitPacked; use crate::BitPackedArray; use crate::BitPackedData; @@ -171,8 +169,11 @@ mod tests { ); } + /// End-to-end check that the real engine path `array.cast(target).execute()` routes through the + /// bit-packed widening pushdown and matches a plain primitive cast over the same values, across + /// every supported integer pair, several chunk-boundary lengths, and a sliced (offset > 0) case. #[test] - fn test_cast_bitpacked_widening_integer_matrix() -> VortexResult<()> { + fn test_cast_bitpacked_widening_via_execute() -> VortexResult<()> { fn values(len: usize) -> PrimitiveArray { PrimitiveArray::from_iter((0..len).map(|i| { let value = if i % 17 == 0 { 31 } else { i % 8 }; @@ -198,6 +199,7 @@ mod tests { PType::U32, PType::U64, ]; + // Lengths exercise empty, sub-chunk, exact chunk, chunk+1, and multi-chunk-with-trailer. let lengths = [0, 1, 7, 1023, 1024, 1025, 2051]; for src in ptypes { @@ -208,21 +210,39 @@ mod tests { for len in lengths { let source = match_each_integer_ptype!(src, |S| { values::(len) }); - let source_ref = source.clone().into_array(); - let packed = bp(&source_ref, 3); + let source_ref = source.into_array(); let target = DType::Primitive(tgt, Nullability::NonNullable); - let mut ctx = LEGACY_SESSION.create_execution_ctx(); - let casted = - ::cast(packed.as_view(), &target, &mut ctx)? - .expect( - "supported widening integer cast should hit BitPacked CastKernel", - ); + + // Reference: plain primitive cast of the same values. let reference = source_ref - .cast(target)? + .clone() + .cast(target.clone())? .execute::(&mut ctx)?; + // Candidate: bit-pack, then cast through the real engine. This dispatches to + // `BitPacked`'s `CastKernel` widening pushdown. + let packed = bp(&source_ref, 3).into_array(); + let casted = packed + .cast(target.clone())? + .execute::(&mut ctx)?; assert_arrays_eq!(casted, reference); + + // Also exercise the sliced/offset path (offset > 0, trailer present). + if len >= 4 { + let lo = len / 4; + let hi = len - len / 4; + let sliced = bp(&source_ref, 3).into_array().slice(lo..hi)?; + let casted = sliced + .cast(target.clone())? + .execute::(&mut ctx)?; + let reference = source_ref + .clone() + .slice(lo..hi)? + .cast(target.clone())? + .execute::(&mut ctx)?; + assert_arrays_eq!(casted, reference); + } } } }