Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 51 additions & 1 deletion benchmarks/benches/lib.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
use itertools::Itertools;
use std::cmp::Reverse;
use std::io::Cursor;
use std::ops::{BitAnd, BitAndAssign, BitOr, BitOrAssign, BitXor, BitXorAssign, Sub, SubAssign};

use criterion::measurement::Measurement;
Expand Down Expand Up @@ -117,6 +118,41 @@ fn pairwise_binary_op_matrix(
group.finish();
}

/// Registers a `pairwise_{op_name}` benchmark group that applies `op_ref_own` to every pair of
/// neighbouring bitmaps of each dataset, handing the right-hand bitmap over as serialized bytes.
///
/// For every batch the setup closure clones the left bitmap and serializes the right one; the
/// timed routine then consumes those inputs and runs the operation on each pair.
fn pairwise_ops_with_serialized(
    c: &mut Criterion,
    op_name: &str,
    op_ref_own: fn(&RoaringBitmap, &[u8]) -> RoaringBitmap,
) {
    let mut group = c.benchmark_group(format!("pairwise_{op_name}"));

    for dataset in Datasets {
        // Sliding windows of size two over the dataset: (left operand, right operand).
        let windows: Vec<_> = dataset.bitmaps.iter().cloned().tuple_windows::<(_, _)>().collect();
        let id = BenchmarkId::new("ref_own", &dataset.name);

        group.bench_function(id, |bencher| {
            bencher.iter_batched(
                || {
                    let mut inputs = Vec::with_capacity(windows.len());
                    for (lhs, rhs) in &windows {
                        let mut bytes = Vec::new();
                        rhs.serialize_into(&mut bytes).unwrap();
                        inputs.push((lhs.clone(), bytes));
                    }
                    inputs
                },
                |inputs| {
                    for (bitmap, bytes) in inputs {
                        black_box(op_ref_own(&bitmap, &bytes));
                    }
                },
                BatchSize::SmallInput,
            );
        });
    }

    group.finish();
}

fn pairwise_binary_op<R, M: Measurement>(
group: &mut BenchmarkGroup<M>,
op_name: &str,
Expand Down Expand Up @@ -557,6 +593,18 @@ fn successive_or(c: &mut Criterion) {
group.finish();
}

fn intersection_with_serialized(c: &mut Criterion) {
pairwise_ops_with_serialized(c, "intersection_with_serialized_unchecked", |a, b| {
a.intersection_with_serialized_unchecked(Cursor::new(b)).unwrap()
})
}

fn union_with_serialized(c: &mut Criterion) {
pairwise_ops_with_serialized(c, "union_with_serialized_unchecked", |a, b| {
a.union_with_serialized_unchecked(Cursor::new(b)).unwrap()
})
}

// LEGACY BENCHMARKS
// =================

Expand Down Expand Up @@ -740,6 +788,8 @@ criterion_group!(
serialization,
deserialization,
successive_and,
successive_or
successive_or,
intersection_with_serialized,
union_with_serialized
);
criterion_main!(benches);
12 changes: 4 additions & 8 deletions roaring/src/bitmap/inherent.rs
Original file line number Diff line number Diff line change
Expand Up @@ -400,15 +400,11 @@ impl RoaringBitmap {
pub fn remove(&mut self, value: u32) -> bool {
let (key, index) = util::split(value);
match self.containers.binary_search_by_key(&key, |c| c.key) {
Ok(loc) => {
if self.containers[loc].remove(index) {
if self.containers[loc].is_empty() {
self.containers.remove(loc);
}
true
} else {
false
Ok(loc) if self.containers[loc].remove(index) => {
if self.containers[loc].is_empty() {
self.containers.remove(loc);
}
true
}
_ => false,
}
Expand Down
240 changes: 240 additions & 0 deletions roaring/src/bitmap/ops_with_serialized.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
use bytemuck::cast_slice_mut;
use byteorder::{LittleEndian, ReadBytesExt};
use core::convert::Infallible;

use std::error::Error;
use std::io::{self, SeekFrom};
use std::mem;
Expand Down Expand Up @@ -275,6 +276,231 @@ impl RoaringBitmap {

Ok(RoaringBitmap { containers })
}

/// Returns the union of this [`RoaringBitmap`] with another bitmap that is still serialized.
///
/// When only the union result is needed, this is faster and more space efficient than
/// deserializing `other` first: it reduces the number of deserialized internal containers
/// and therefore the number of allocations and byte copies.
///
/// Array and bitmap containers read from `other` are built through the `*_unchecked` store
/// constructors, so `other` is expected to contain a well-formed serialized bitmap.
///
/// # Errors
///
/// Returns an [`io::Error`] when the header of `other` cannot be interpreted or when reading
/// from `other` fails.
///
/// # Examples
///
/// ```rust
/// use roaring::RoaringBitmap;
/// use std::io::Cursor;
///
/// let rb1: RoaringBitmap = (1..4).collect();
/// let rb2: RoaringBitmap = (3..5).collect();
///
/// // Pretend that only the serialized form of rb2 is at hand.
/// let mut bytes = Vec::new();
/// rb2.serialize_into(&mut bytes).unwrap();
/// let rb2_bytes = Cursor::new(bytes);
///
/// assert_eq!(
///     rb1.union_with_serialized_unchecked(rb2_bytes).unwrap(),
///     rb1 | rb2,
/// );
/// ```
pub fn union_with_serialized_unchecked<R>(&self, other: R) -> io::Result<RoaringBitmap>
where
    R: io::Read + io::Seek,
{
    // Both conversions are infallible: the decoded values are wrapped without validation.
    self.union_with_serialized_impl::<R, _, Infallible, _, Infallible>(
        other,
        |array_values| Ok(ArrayStore::from_vec_unchecked(array_values)),
        |cardinality, words| Ok(BitmapStore::from_unchecked(cardinality, words)),
    )
}

/// Shared implementation of the union between `self` and a serialized bitmap read from `reader`.
///
/// Parses the serialized header (cookie, optional run-container flags, per-container
/// `[key, cardinality - 1]` descriptions). When the header carries an offsets table the work is
/// delegated to `union_with_serialized_impl_with_offsets`; otherwise every container is read
/// sequentially from `reader` and merged with the container of `self` that has the same key.
///
/// `a` turns decoded array values into an `ArrayStore` and `b` turns a cardinality plus decoded
/// bitmap words into a `BitmapStore`; an error from either is reported as
/// `io::ErrorKind::InvalidData`.
///
/// Returns an error for an unknown cookie, for a container count above `u16::MAX + 1`, for a
/// run whose end overflows `u16`, and for any failed read.
fn union_with_serialized_impl<R, A, AErr, B, BErr>(
&self,
mut reader: R,
a: A,
b: B,
) -> io::Result<RoaringBitmap>
where
R: io::Read + io::Seek,
A: Fn(Vec<u16>) -> Result<ArrayStore, AErr>,
AErr: Error + Send + Sync + 'static,
B: Fn(u64, Box<[u64; 1024]>) -> Result<BitmapStore, BErr>,
BErr: Error + Send + Sync + 'static,
{
// Decode the cookie: it determines the container count and which optional header
// sections (run-container flags, offsets table) follow.
let (size, has_offsets, has_run_containers) = {
let cookie = reader.read_u32::<LittleEndian>()?;
if cookie == SERIAL_COOKIE_NO_RUNCONTAINER {
// The container count is stored as a separate u32; offsets are always present.
(reader.read_u32::<LittleEndian>()? as usize, true, false)
} else if (cookie as u16) == SERIAL_COOKIE {
// The upper 16 bits of the cookie hold the container count minus one.
let size = ((cookie >> 16) + 1) as usize;
(size, size >= NO_OFFSET_THRESHOLD, true)
} else {
return Err(io::Error::other("unknown cookie value"));
}
};

// One flag bit per container, telling whether it is stored as a run container.
let run_container_bitmap = if has_run_containers {
let mut bitmap = vec![0u8; size.div_ceil(8)];
reader.read_exact(&mut bitmap)?;
Some(bitmap)
} else {
None
};

// Reject oversized counts before allocating the description table.
if size > u16::MAX as usize + 1 {
return Err(io::Error::other("size is greater than supported"));
}

// Each description is a little-endian `[key, cardinality - 1]` pair.
let mut descriptions = vec![[0; 2]; size];
reader.read_exact(cast_slice_mut(&mut descriptions))?;
descriptions.iter_mut().for_each(|[key, len]| {
*key = u16::from_le(*key);
*len = u16::from_le(*len);
});

if has_offsets {
// Read the little-endian u32 offsets table and hand over to the seeking variant.
let mut offsets = vec![0; size];
reader.read_exact(cast_slice_mut(&mut offsets))?;
offsets.iter_mut().for_each(|offset| *offset = u32::from_le(*offset));
return self.union_with_serialized_impl_with_offsets(
reader,
a,
b,
&descriptions,
&offsets,
run_container_bitmap.as_deref(),
);
}

let mut containers = Vec::new();
let mut left_containers = self.containers.iter().peekable();
for (i, &[key, len_minus_one]) in descriptions.iter().enumerate() {
// Containers of `self` with a smaller key have no serialized counterpart: keep them.
while left_containers.peek().is_some_and(|container| container.key < key) {
containers.push(left_containers.next().unwrap().clone());
}

let cardinality = u64::from(len_minus_one) + 1;
let is_run_container =
run_container_bitmap.as_ref().is_some_and(|bm| bm[i / 8] & (1 << (i % 8)) != 0);

let store = if is_run_container {
// Run container: a u16 run count followed by `[start, len]` pairs.
let runs = reader.read_u16::<LittleEndian>()?;
let mut intervals = vec![[0, 0]; runs as usize];
reader.read_exact(cast_slice_mut(&mut intervals))?;
intervals.iter_mut().for_each(|[s, len]| {
*s = u16::from_le(*s);
*len = u16::from_le(*len);
});

// Sum of the raw `len` fields, used only to size the store up front.
let cardinality = intervals.iter().map(|[_, len]| *len as usize).sum();
let mut store = Store::with_capacity(cardinality);
intervals.into_iter().try_for_each(|[s, len]| -> Result<(), io::ErrorKind> {
// Each run covers `s..=s + len`; an end past `u16::MAX` is invalid data.
let end = s.checked_add(len).ok_or(io::ErrorKind::InvalidData)?;
store.insert_range(RangeInclusive::new(s, end));
Ok(())
})?;
store
} else if cardinality <= ARRAY_LIMIT {
// Array container: `cardinality` little-endian u16 values.
let mut values = vec![0; cardinality as usize];
reader.read_exact(cast_slice_mut(&mut values))?;
values.iter_mut().for_each(|n| *n = u16::from_le(*n));
let array = a(values).map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
Store::Array(array)
} else {
// Bitmap container: `BITMAP_LENGTH` little-endian u64 words.
let mut values = Box::new([0; BITMAP_LENGTH]);
reader.read_exact(cast_slice_mut(&mut values[..]))?;
values.iter_mut().for_each(|n| *n = u64::from_le(*n));
let bitmap = b(cardinality, values)
.map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
Store::Bitmap(bitmap)
};

// Merge with the matching container of `self`, if there is one.
let mut right_container = Container { key, store };
if left_containers.peek().is_some_and(|container| container.key == key) {
right_container |= left_containers.next().unwrap();
}
if !right_container.is_empty() {
containers.push(right_container);
}
}

// Whatever remains in `self` has keys beyond the last serialized container.
containers.extend(left_containers.cloned());
Ok(RoaringBitmap { containers })
}

/// Computes the union of `self` with a serialized bitmap whose header carries an offsets table.
///
/// The header is expected to have been parsed by the caller. For every serialized container
/// this method seeks `reader` to `offsets[i]` (measured from the start of the stream), decodes
/// the container and merges it with the container of `self` that has the same key, if any.
/// Containers of `self` whose key does not appear in `descriptions` are cloned into the result.
///
/// * `a` builds an `ArrayStore` from decoded array values.
/// * `b` builds a `BitmapStore` from a cardinality and decoded bitmap words.
/// * `descriptions` holds one `[key, cardinality - 1]` pair per serialized container.
/// * `offsets` holds the stream position of each container; it is indexed with the same
///   index as `descriptions`, so it must be at least as long.
/// * `run_container_bitmap` holds one flag bit per container marking run containers, when the
///   serialized bitmap has any.
///
/// # Errors
///
/// Propagates every seek/read failure as an [`io::Error`] (a truncated input is reported
/// instead of causing a panic), reports a run whose end overflows `u16` as
/// `io::ErrorKind::InvalidData`, and wraps errors from `a` / `b` as
/// `io::ErrorKind::InvalidData`.
fn union_with_serialized_impl_with_offsets<R, A, AErr, B, BErr>(
    &self,
    mut reader: R,
    a: A,
    b: B,
    descriptions: &[[u16; 2]],
    offsets: &[u32],
    run_container_bitmap: Option<&[u8]>,
) -> io::Result<RoaringBitmap>
where
    R: io::Read + io::Seek,
    A: Fn(Vec<u16>) -> Result<ArrayStore, AErr>,
    AErr: Error + Send + Sync + 'static,
    B: Fn(u64, Box<[u64; 1024]>) -> Result<BitmapStore, BErr>,
    BErr: Error + Send + Sync + 'static,
{
    let mut containers = Vec::new();
    let mut left_containers = self.containers.iter().peekable();
    for (i, &[key, len_minus_one]) in descriptions.iter().enumerate() {
        // Containers of `self` with a smaller key have no serialized counterpart: keep them.
        while left_containers.peek().is_some_and(|container| container.key < key) {
            containers.push(left_containers.next().unwrap().clone());
        }

        reader.seek(SeekFrom::Start(u64::from(offsets[i])))?;

        let cardinality = u64::from(len_minus_one) + 1;
        let is_run_container =
            run_container_bitmap.is_some_and(|bm| bm[i / 8] & (1 << (i % 8)) != 0);

        let store = if is_run_container {
            // Run container: a u16 run count followed by `[start, len]` pairs.
            let runs = reader.read_u16::<LittleEndian>()?;
            let mut intervals = vec![[0, 0]; runs as usize];
            reader.read_exact(cast_slice_mut(&mut intervals))?;
            intervals.iter_mut().for_each(|[s, len]| {
                *s = u16::from_le(*s);
                *len = u16::from_le(*len);
            });

            // Sum of the raw `len` fields, used only to size the store up front.
            let cardinality = intervals.iter().map(|[_, len]| *len as usize).sum();
            let mut store = Store::with_capacity(cardinality);
            intervals.into_iter().try_for_each(|[s, len]| -> Result<(), io::ErrorKind> {
                // Each run covers `s..=s + len`; an end past `u16::MAX` is invalid data.
                let end = s.checked_add(len).ok_or(io::ErrorKind::InvalidData)?;
                store.insert_range(RangeInclusive::new(s, end));
                Ok(())
            })?;
            store
        } else if cardinality <= ARRAY_LIMIT {
            // Array container: `cardinality` little-endian u16 values.
            let mut values = vec![0; cardinality as usize];
            reader.read_exact(cast_slice_mut(&mut values))?;
            values.iter_mut().for_each(|n| *n = u16::from_le(*n));
            let array = a(values).map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
            Store::Array(array)
        } else {
            // Bitmap container: `BITMAP_LENGTH` little-endian u64 words.
            let mut values = Box::new([0; BITMAP_LENGTH]);
            reader.read_exact(cast_slice_mut(&mut values[..]))?;
            values.iter_mut().for_each(|n| *n = u64::from_le(*n));
            let bitmap = b(cardinality, values)
                .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
            Store::Bitmap(bitmap)
        };

        // Merge with the matching container of `self`, if there is one.
        let mut right_container = Container { key, store };
        if left_containers.peek().is_some_and(|container| container.key == key) {
            right_container |= left_containers.next().unwrap();
        }
        if !right_container.is_empty() {
            containers.push(right_container);
        }
    }

    // Whatever remains in `self` has keys beyond the last serialized container.
    containers.extend(left_containers.cloned());
    Ok(RoaringBitmap { containers })
}
}

#[cfg(test)]
Expand All @@ -297,4 +523,18 @@ mod test {
prop_assert_eq!(a.intersection_with_serialized_unchecked(Cursor::new(serialized_bytes_b)).unwrap(), a & b);
}
}

proptest! {
    // Property: the union computed against the serialized form of `b` equals the union of the
    // two materialized bitmaps.
    // NOTE(review): despite the `_intersection` suffix in its name, this test checks the union.
    #[test]
    fn union_with_serialized_eq_materialized_intersection(
        a in RoaringBitmap::arbitrary(),
        b in RoaringBitmap::arbitrary()
    ) {
        let mut buffer = Vec::new();
        b.serialize_into(&mut buffer).unwrap();

        let from_serialized =
            a.union_with_serialized_unchecked(Cursor::new(buffer.as_slice())).unwrap();
        prop_assert_eq!(from_serialized, a | b);
    }
}
}