Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions encodings/fastlanes/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -63,3 +63,8 @@ required-features = ["_test-harness"]
[[bench]]
name = "bitpack_compare"
harness = false

[[bench]]
name = "cast_bitpacked"
harness = false
required-features = ["_test-harness"]
157 changes: 157 additions & 0 deletions encodings/fastlanes/benches/cast_bitpacked.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright the Vortex contributors

//! Benchmarks the cost of widening a bit-packed narrow integer column to a wider integer type on
//! decompression (e.g. `u16 -> u32`).
//!
//! Three strategies are compared:
//!
//! - `cast_execute`: the real public path, `array.cast(u32).execute()`. With the bit-packed cast
//! pushdown wired into `BitPacked`'s `CastKernel`, this unpacks-and-casts in a single pass.
//! - `canonicalize_then_cast`: explicitly canonicalizes to a full-length `u16` `PrimitiveArray` and
//! then casts that to `u32`. This reproduces the behaviour before the pushdown existed (two
//! full-length buffers, the `u16` intermediate written to RAM and read back, plus the generic
//! primitive cast kernel's bounds-check scan), and serves as the in-run baseline.
//! - `pushdown_helper`: calls the `unpack_and_cast_into_builder` helper directly. This is the floor
//! for the technique, and `cast_execute` should track it once the kernel is wired in.

#![expect(clippy::unwrap_used)]

use std::sync::LazyLock;

use divan::Bencher;
use rand::RngExt;
use rand::SeedableRng;
use rand::prelude::StdRng;
use vortex_array::ArrayRef;
use vortex_array::IntoArray;
use vortex_array::VortexSessionExecute;
use vortex_array::arrays::ChunkedArray;
use vortex_array::arrays::PrimitiveArray;
use vortex_array::builders::PrimitiveBuilder;
use vortex_array::builtins::ArrayBuiltins;
use vortex_array::dtype::DType;
use vortex_array::dtype::Nullability;
use vortex_array::dtype::PType;
use vortex_array::session::ArraySession;
use vortex_array::validity::Validity;
use vortex_buffer::BufferMut;
use vortex_error::VortexExpect;
use vortex_fastlanes::BitPackedArray;
use vortex_fastlanes::BitPackedData;
use vortex_fastlanes::bitpack_decompress::unpack_and_cast_into_builder;
use vortex_session::VortexSession;

/// Benchmark binary entry point: hands control straight to `divan::main()`.
fn main() {
divan::main();
}

/// Process-wide session from which the fixtures and every benchmark obtain their execution
/// contexts (`SESSION.create_execution_ctx()`). Initialized lazily on first access, starting from
/// `VortexSession::empty()` and adding `ArraySession` via `with`.
static SESSION: LazyLock<VortexSession> =
LazyLock::new(|| VortexSession::empty().with::<ArraySession>());

/// Target dtype passed to `cast` in the cast benchmarks: a non-nullable `u32` primitive.
const U32: DType = DType::Primitive(PType::U32, Nullability::NonNullable);

/// Benchmark parameter grid. Each entry is `(chunk_len, chunk_count, fraction_patched)`; every
/// shape is listed once without patches and once with 1% of the values patched.
const ARGS: &[(usize, usize, f64)] = &[
    // A single chunk of 2^16 (65_536) values.
    (1 << 16, 1, 0.00),
    (1 << 16, 1, 0.01),
    // Sixteen chunks of 2^16 values each.
    (1 << 16, 16, 0.00),
    (1 << 16, 16, 0.01),
    // A single chunk of 2^20 (1_048_576) values.
    (1 << 20, 1, 0.00),
    (1 << 20, 1, 0.01),
];

/// Generates `len` random `u16` values and bit-packs them at a fixed width of 9 bits.
///
/// Each value is drawn independently: with probability `fraction_patched` it comes from
/// `[2^9, u16::MAX)` and therefore does not fit in the packed width; otherwise it comes from
/// `[0, 2^9)`. Panics (via `vortex_expect`) if encoding fails.
fn make_chunk(rng: &mut StdRng, len: usize, fraction_patched: f64) -> BitPackedArray {
    const BIT_WIDTH: u8 = 9;
    // Smallest value that no longer fits in `BIT_WIDTH` bits.
    const OVERFLOW_START: u16 = 1u16 << BIT_WIDTH;

    // One coin flip followed by one range draw per element, in that order, so the sequence is
    // fully determined by the seed of `rng`.
    let mut next_value = || {
        let range = if rng.random_bool(fraction_patched) {
            OVERFLOW_START..u16::MAX
        } else {
            0..OVERFLOW_START
        };
        rng.random_range(range)
    };

    let values: BufferMut<u16> = (0..len).map(|_| next_value()).collect();
    let unpacked = PrimitiveArray::new(values, Validity::NonNullable).into_array();
    let mut ctx = SESSION.create_execution_ctx();
    BitPackedData::encode(&unpacked, BIT_WIDTH, &mut ctx).vortex_expect("encode")
}

/// Builds `count` bit-packed chunks of `len` values each, all drawn from one RNG seeded with `0`
/// so that every benchmark sees the same data.
fn make_chunks(len: usize, count: usize, fraction_patched: f64) -> Vec<BitPackedArray> {
    let mut rng = StdRng::seed_from_u64(0);
    let mut chunks = Vec::with_capacity(count);
    for _ in 0..count {
        chunks.push(make_chunk(&mut rng, len, fraction_patched));
    }
    chunks
}

/// Turns the prepared chunks into one array: a lone chunk is returned as-is (cloned), anything
/// else is wrapped in a `ChunkedArray` over clones of the chunks.
fn single(chunks: &[BitPackedArray]) -> ArrayRef {
    match chunks {
        [only] => only.clone().into_array(),
        _ => {
            let parts = chunks.iter().map(|chunk| chunk.clone().into_array());
            ChunkedArray::from_iter(parts).into_array()
        }
    }
}

/// Public-API path: cast the bit-packed (or chunked) array to `u32` and execute the result into a
/// `PrimitiveArray`. This is the route that exercises the bit-packed cast pushdown kernel.
#[cfg(not(codspeed))]
#[divan::bench(args = ARGS)]
fn cast_execute(bencher: Bencher, (chunk_len, chunk_count, frac): (usize, usize, f64)) {
    let chunks = make_chunks(chunk_len, chunk_count, frac);
    // Built per input by divan via `with_inputs`; the closure passed to `bench_refs` is the
    // benchmarked body.
    let fresh_input = || (single(&chunks), SESSION.create_execution_ctx());
    bencher.with_inputs(fresh_input).bench_refs(|(array, ctx)| {
        let widened = array.clone().cast(U32).unwrap();
        widened.execute::<PrimitiveArray>(ctx).unwrap()
    });
}

/// Two-step baseline mirroring the behaviour before the pushdown existed: first execute the array
/// into a full-length `u16` `PrimitiveArray`, then cast that primitive array to `u32` and execute
/// again.
#[cfg(not(codspeed))]
#[divan::bench(args = ARGS)]
fn canonicalize_then_cast(bencher: Bencher, (chunk_len, chunk_count, frac): (usize, usize, f64)) {
    let chunks = make_chunks(chunk_len, chunk_count, frac);
    let fresh_input = || (single(&chunks), SESSION.create_execution_ctx());
    bencher.with_inputs(fresh_input).bench_refs(|(array, ctx)| {
        // Step 1: materialize the narrow intermediate.
        let narrow = array.clone().execute::<PrimitiveArray>(ctx).unwrap();
        // Step 2: widen the already-unpacked values.
        let widened = narrow.into_array().cast(U32).unwrap();
        widened.execute::<PrimitiveArray>(ctx).unwrap()
    });
}

/// Calls `unpack_and_cast_into_builder::<u16, u32>` directly on every chunk, appending into one
/// `u32` builder pre-sized for all chunks, then finishes the builder into a primitive array.
/// This bypasses `cast`/`execute` dispatch entirely.
#[cfg(not(codspeed))]
#[divan::bench(args = ARGS)]
fn pushdown_helper(bencher: Bencher, (chunk_len, chunk_count, frac): (usize, usize, f64)) {
    let chunks = make_chunks(chunk_len, chunk_count, frac);
    let total = chunk_len * chunk_count;
    bencher
        .with_inputs(|| {
            // Each input gets its own chunk clones, builder and execution context.
            let owned_chunks = chunks.clone();
            let builder = PrimitiveBuilder::<u32>::with_capacity(Nullability::NonNullable, total);
            let ctx = SESSION.create_execution_ctx();
            (owned_chunks, builder, ctx)
        })
        .bench_refs(|(chunks, builder, ctx)| {
            for chunk in chunks.iter() {
                let view = chunk.as_view();
                unpack_and_cast_into_builder::<u16, u32>(view, builder, ctx).unwrap();
            }
            builder.finish_into_primitive()
        });
}
8 changes: 5 additions & 3 deletions encodings/fastlanes/public-api.lock
Original file line number Diff line number Diff line change
Expand Up @@ -34,12 +34,12 @@ pub fn vortex_fastlanes::bitpack_compress::gather_patches(&vortex_array::arrays:

pub mod vortex_fastlanes::bitpack_decompress

pub fn vortex_fastlanes::bitpack_decompress::apply_patches_to_uninit_range<T: vortex_array::dtype::ptype::NativePType>(&mut vortex_array::builders::primitive::UninitRange<'_, T>, &vortex_array::patches::Patches, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<()>

pub fn vortex_fastlanes::bitpack_decompress::apply_patches_to_uninit_range_fn<T: vortex_array::dtype::ptype::NativePType, F: core::ops::function::Fn(T) -> T>(&mut vortex_array::builders::primitive::UninitRange<'_, T>, &vortex_array::patches::Patches, &mut vortex_array::executor::ExecutionCtx, F) -> vortex_error::VortexResult<()>
pub fn vortex_fastlanes::bitpack_decompress::apply_patches_to_uninit_range_fn<S: vortex_array::dtype::ptype::NativePType, T: vortex_array::dtype::ptype::NativePType, Fun: core::ops::function::Fn(S) -> T>(&mut vortex_array::builders::primitive::UninitRange<'_, T>, &vortex_array::patches::Patches, &mut vortex_array::executor::ExecutionCtx, Fun) -> vortex_error::VortexResult<()>

pub fn vortex_fastlanes::bitpack_decompress::count_exceptions(u8, &[usize]) -> usize

pub fn vortex_fastlanes::bitpack_decompress::unpack_and_cast_into_builder<F, T>(vortex_array::array::view::ArrayView<'_, vortex_fastlanes::BitPacked>, &mut vortex_array::builders::primitive::PrimitiveBuilder<T>, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<()> where F: vortex_fastlanes::unpack_iter::BitPacked + num_traits::cast::AsPrimitive<T>, T: vortex_array::dtype::ptype::NativePType

pub fn vortex_fastlanes::bitpack_decompress::unpack_array(vortex_array::array::view::ArrayView<'_, vortex_fastlanes::BitPacked>, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<vortex_array::arrays::primitive::vtable::PrimitiveArray>

pub fn vortex_fastlanes::bitpack_decompress::unpack_primitive_array<T: vortex_fastlanes::unpack_iter::BitPacked>(vortex_array::array::view::ArrayView<'_, vortex_fastlanes::BitPacked>, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<vortex_array::arrays::primitive::vtable::PrimitiveArray>
Expand Down Expand Up @@ -84,6 +84,8 @@ pub fn vortex_fastlanes::unpack_iter::UnpackedChunks<T, S>::try_new_with_strateg

impl<T: vortex_fastlanes::unpack_iter::BitPacked> vortex_fastlanes::unpack_iter::UnpackedChunks<T, vortex_fastlanes::unpack_iter::BitPackingStrategy>

pub fn vortex_fastlanes::unpack_iter::UnpackedChunks<T, vortex_fastlanes::unpack_iter::BitPackingStrategy>::decode_cast_into<U: core::marker::Copy>(&mut self, &mut [core::mem::maybe_uninit::MaybeUninit<U>], impl core::ops::function::Fn(T) -> U)

pub fn vortex_fastlanes::unpack_iter::UnpackedChunks<T, vortex_fastlanes::unpack_iter::BitPackingStrategy>::full_chunks(&mut self) -> vortex_fastlanes::unpack_iter::BitUnpackIterator<'_, T>

pub fn vortex_fastlanes::unpack_iter::UnpackedChunks<T, vortex_fastlanes::unpack_iter::BitPackingStrategy>::try_new(&vortex_fastlanes::BitPackedData, usize) -> vortex_error::VortexResult<Self>
Expand Down
62 changes: 53 additions & 9 deletions encodings/fastlanes/src/bitpacking/array/bitpack_decompress.rs
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ pub(crate) fn unpack_into_primitive_builder<T: BitPackedUnpack>(
bit_packed_iter.decode_into(uninit_slice);

if let Some(patches) = array.patches() {
apply_patches_to_uninit_range(&mut uninit_range, &patches, ctx)?;
apply_patches_to_uninit_range_fn(&mut uninit_range, &patches, ctx, |v: T| v)?;
};

// SAFETY: We have set a correct validity mask via `append_mask` with `array.len()` values and
Expand All @@ -79,26 +79,70 @@ pub(crate) fn unpack_into_primitive_builder<T: BitPackedUnpack>(
Ok(())
}

pub fn apply_patches_to_uninit_range<T: NativePType>(
dst: &mut UninitRange<T>,
patches: &Patches,
/// Unpack a bit-packed array of physical type `F` directly into a wider primitive type `T`,
/// casting each value during decompression.
///
/// This is the "cast pushdown" path: rather than canonicalizing to a full-length `F`-typed
/// `PrimitiveArray` and then casting it to `T` (two full-length buffers, with the `F` intermediate
/// written out to RAM), we unpack each 1024-element FastLanes chunk into a small cache-resident
/// scratch buffer and cast-copy straight into the `T` output. Only the `T` output buffer is
/// allocated and touched in RAM.
///
/// The caller must ensure all valid values are representable in `T` (it is intended for widening
/// casts such as `u16 -> u32`); narrowing or sign-changing casts are not validated here.
pub fn unpack_and_cast_into_builder<F, T>(
array: ArrayView<'_, BitPacked>,
builder: &mut PrimitiveBuilder<T>,
ctx: &mut ExecutionCtx,
) -> VortexResult<()> {
apply_patches_to_uninit_range_fn(dst, patches, ctx, |x| x)
) -> VortexResult<()>
where
F: BitPackedUnpack + AsPrimitive<T>,
T: NativePType,
{
if array.is_empty() {
return Ok(());
}

let len = array.len();
let mut uninit_range = builder.uninit_range(len);

// SAFETY: We initialize all `len` values below via `decode_map_into` and the patch loop.
unsafe {
uninit_range.append_mask(array.validity()?.execute_mask(len, ctx)?);
}

// SAFETY: `decode_map_into` writes a value to every slot in this range.
let uninit_slice = unsafe { uninit_range.slice_uninit_mut(0, len) };

let mut chunks = array.unpacked_chunks::<F>()?;
chunks.decode_map_into(uninit_slice, |v: F| v.as_());

if let Some(patches) = array.patches() {
apply_patches_to_uninit_range_fn(&mut uninit_range, &patches, ctx, |v: F| v.as_())?;
}

// SAFETY: A correct validity mask of `len` values was set via `append_mask`, and the same
// number of values was initialized via `decode_map_into` (and overwritten by patches).
unsafe {
uninit_range.finish();
}
Ok(())
}

pub fn apply_patches_to_uninit_range_fn<T: NativePType, F: Fn(T) -> T>(
/// Applies the patches to the uninitialized range, casting each stored patch value of physical
/// type `S` to the output type `T` via the identity-or-widening map `f`.
pub fn apply_patches_to_uninit_range_fn<S: NativePType, T: NativePType, Fun: Fn(S) -> T>(
dst: &mut UninitRange<T>,
patches: &Patches,
ctx: &mut ExecutionCtx,
f: F,
f: Fun,
) -> VortexResult<()> {
assert_eq!(patches.array_len(), dst.len());

let indices = patches.indices().clone().execute::<PrimitiveArray>(ctx)?;
let values = patches.values().clone().execute::<PrimitiveArray>(ctx)?;
assert!(values.all_valid(ctx)?, "Patch values must be all valid");
let values = values.as_slice::<T>();
let values = values.as_slice::<S>();

match_each_unsigned_integer_ptype!(indices.ptype(), |P| {
for (index, &value) in indices.as_slice::<P>().iter().zip_eq(values) {
Expand Down
Loading
Loading