diff --git a/benchmarks/benches/lib.rs b/benchmarks/benches/lib.rs
index 23f6a6c34..b05b9b195 100644
--- a/benchmarks/benches/lib.rs
+++ b/benchmarks/benches/lib.rs
@@ -1,5 +1,6 @@
 use itertools::Itertools;
 use std::cmp::Reverse;
+use std::io::Cursor;
 use std::ops::{BitAnd, BitAndAssign, BitOr, BitOrAssign, BitXor, BitXorAssign, Sub, SubAssign};
 
 use criterion::measurement::Measurement;
@@ -117,6 +118,46 @@ fn pairwise_binary_op_matrix(
     group.finish();
 }
 
+/// Benchmarks `op_ref_own` over every consecutive pair of bitmaps in each
+/// dataset, with the right-hand bitmap handed over as serialized bytes.
+///
+/// Serialization happens in the batch setup closure, outside the measured
+/// routine.
+fn pairwise_ops_with_serialized(
+    c: &mut Criterion,
+    op_name: &str,
+    op_ref_own: fn(&RoaringBitmap, &[u8]) -> RoaringBitmap,
+) {
+    let mut group = c.benchmark_group(format!("pairwise_{op_name}"));
+
+    for dataset in Datasets {
+        let pairs = dataset.bitmaps.iter().cloned().tuple_windows::<(_, _)>().collect::<Vec<_>>();
+
+        group.bench_function(BenchmarkId::new("ref_own", &dataset.name), |b| {
+            b.iter_batched(
+                || {
+                    pairs
+                        .iter()
+                        .map(|(a, b)| {
+                            let mut buf = Vec::new();
+                            b.serialize_into(&mut buf).unwrap();
+                            (a.clone(), buf)
+                        })
+                        .collect::<Vec<_>>()
+                },
+                |bitmaps| {
+                    for (a, b) in bitmaps {
+                        black_box(op_ref_own(&a, &b));
+                    }
+                },
+                BatchSize::SmallInput,
+            );
+        });
+    }
+
+    group.finish();
+}
+
 fn pairwise_binary_op<R, M: Measurement>(
     group: &mut BenchmarkGroup<M>,
     op_name: &str,
@@ -557,6 +598,20 @@ fn successive_or(c: &mut Criterion) {
     group.finish();
 }
 
+/// Benchmarks `RoaringBitmap::intersection_with_serialized_unchecked` pairwise.
+fn intersection_with_serialized(c: &mut Criterion) {
+    pairwise_ops_with_serialized(c, "intersection_with_serialized_unchecked", |a, b| {
+        a.intersection_with_serialized_unchecked(Cursor::new(b)).unwrap()
+    })
+}
+
+/// Benchmarks `RoaringBitmap::union_with_serialized_unchecked` pairwise.
+fn union_with_serialized(c: &mut Criterion) {
+    pairwise_ops_with_serialized(c, "union_with_serialized_unchecked", |a, b| {
+        a.union_with_serialized_unchecked(Cursor::new(b)).unwrap()
+    })
+}
+
 // LEGACY BENCHMARKS
 // =================
 
@@ -740,6 +795,8 @@ criterion_group!(
     serialization,
     deserialization,
     successive_and,
-    successive_or
+    successive_or,
+    intersection_with_serialized,
+    union_with_serialized
 );
 criterion_main!(benches);
diff --git
a/roaring/src/bitmap/inherent.rs b/roaring/src/bitmap/inherent.rs
index cbaa6b5da..c419adc75 100644
--- a/roaring/src/bitmap/inherent.rs
+++ b/roaring/src/bitmap/inherent.rs
@@ -400,15 +400,11 @@ impl RoaringBitmap {
     pub fn remove(&mut self, value: u32) -> bool {
         let (key, index) = util::split(value);
         match self.containers.binary_search_by_key(&key, |c| c.key) {
-            Ok(loc) => {
-                if self.containers[loc].remove(index) {
-                    if self.containers[loc].is_empty() {
-                        self.containers.remove(loc);
-                    }
-                    true
-                } else {
-                    false
+            Ok(loc) if self.containers[loc].remove(index) => {
+                if self.containers[loc].is_empty() {
+                    self.containers.remove(loc);
                 }
+                true
             }
             _ => false,
         }
diff --git a/roaring/src/bitmap/ops_with_serialized.rs b/roaring/src/bitmap/ops_with_serialized.rs
index 3bae76c0f..290e850db 100644
--- a/roaring/src/bitmap/ops_with_serialized.rs
+++ b/roaring/src/bitmap/ops_with_serialized.rs
@@ -1,6 +1,7 @@
 use bytemuck::cast_slice_mut;
 use byteorder::{LittleEndian, ReadBytesExt};
 use core::convert::Infallible;
+
 use std::error::Error;
 use std::io::{self, SeekFrom};
 use std::mem;
@@ -275,6 +276,243 @@ impl RoaringBitmap {
 
         Ok(RoaringBitmap { containers })
     }
+
+    /// Computes the union between a materialized [`RoaringBitmap`] and a serialized one.
+    ///
+    /// This is faster and more space efficient when you only need the union result.
+    /// It reduces the number of deserialized internal containers and therefore
+    /// the number of allocations and copies of bytes.
+    ///
+    /// # Errors
+    ///
+    /// Returns an `io::Error` if reading or seeking fails, or if the header
+    /// carries an unknown cookie or an unsupported container count.
+    ///
+    /// # Examples
+    ///
+    /// ```rust
+    /// use roaring::RoaringBitmap;
+    /// use std::io::Cursor;
+    ///
+    /// let rb1: RoaringBitmap = (1..4).collect();
+    /// let rb2: RoaringBitmap = (3..5).collect();
+    ///
+    /// // Let's say the rb2 bitmap is serialized
+    /// let mut bytes = Vec::new();
+    /// rb2.serialize_into(&mut bytes).unwrap();
+    /// let rb2_bytes = Cursor::new(bytes);
+    ///
+    /// assert_eq!(
+    ///     rb1.union_with_serialized_unchecked(rb2_bytes).unwrap(),
+    ///     rb1 | rb2,
+    /// );
+    /// ```
+    pub fn union_with_serialized_unchecked<R>(&self, other: R) -> io::Result<RoaringBitmap>
+    where
+        R: io::Read + io::Seek,
+    {
+        RoaringBitmap::union_with_serialized_impl::<R, _, Infallible, _, Infallible>(
+            self,
+            other,
+            |values| Ok(ArrayStore::from_vec_unchecked(values)),
+            |len, values| Ok(BitmapStore::from_unchecked(len, values)),
+        )
+    }
+
+    /// Parses the serialized header (cookie, optional run-container flags,
+    /// key/cardinality descriptions, optional offsets) and unions every
+    /// serialized container with `self`. `a` and `b` build the array and
+    /// bitmap stores from the raw values read off `reader`.
+    fn union_with_serialized_impl<R, A, AErr, B, BErr>(
+        &self,
+        mut reader: R,
+        a: A,
+        b: B,
+    ) -> io::Result<RoaringBitmap>
+    where
+        R: io::Read + io::Seek,
+        A: Fn(Vec<u16>) -> Result<ArrayStore, AErr>,
+        AErr: Error + Send + Sync + 'static,
+        B: Fn(u64, Box<[u64; 1024]>) -> Result<BitmapStore, BErr>,
+        BErr: Error + Send + Sync + 'static,
+    {
+        let (size, has_offsets, has_run_containers) = {
+            let cookie = reader.read_u32::<LittleEndian>()?;
+            if cookie == SERIAL_COOKIE_NO_RUNCONTAINER {
+                (reader.read_u32::<LittleEndian>()? as usize, true, false)
+            } else if (cookie as u16) == SERIAL_COOKIE {
+                let size = ((cookie >> 16) + 1) as usize;
+                (size, size >= NO_OFFSET_THRESHOLD, true)
+            } else {
+                return Err(io::Error::other("unknown cookie value"));
+            }
+        };
+
+        let run_container_bitmap = if has_run_containers {
+            let mut bitmap = vec![0u8; size.div_ceil(8)];
+            reader.read_exact(&mut bitmap)?;
+            Some(bitmap)
+        } else {
+            None
+        };
+
+        if size > u16::MAX as usize + 1 {
+            return Err(io::Error::other("size is greater than supported"));
+        }
+
+        let mut descriptions = vec![[0; 2]; size];
+        reader.read_exact(cast_slice_mut(&mut descriptions))?;
+        descriptions.iter_mut().for_each(|[key, len]| {
+            *key = u16::from_le(*key);
+            *len = u16::from_le(*len);
+        });
+
+        if has_offsets {
+            let mut offsets = vec![0; size];
+            reader.read_exact(cast_slice_mut(&mut offsets))?;
+            offsets.iter_mut().for_each(|offset| *offset = u32::from_le(*offset));
+            return self.union_with_serialized_impl_with_offsets(
+                reader,
+                a,
+                b,
+                &descriptions,
+                &offsets,
+                run_container_bitmap.as_deref(),
+            );
+        }
+
+        let mut containers = Vec::new();
+        let mut left_containers = self.containers.iter().peekable();
+        for (i, &[key, len_minus_one]) in descriptions.iter().enumerate() {
+            while left_containers.peek().is_some_and(|container| container.key < key) {
+                containers.push(left_containers.next().unwrap().clone());
+            }
+
+            let cardinality = u64::from(len_minus_one) + 1;
+            let is_run_container =
+                run_container_bitmap.as_ref().is_some_and(|bm| bm[i / 8] & (1 << (i % 8)) != 0);
+
+            let store = if is_run_container {
+                let runs = reader.read_u16::<LittleEndian>()?;
+                let mut intervals = vec![[0, 0]; runs as usize];
+                reader.read_exact(cast_slice_mut(&mut intervals))?;
+                intervals.iter_mut().for_each(|[s, len]| {
+                    *s = u16::from_le(*s);
+                    *len = u16::from_le(*len);
+                });
+
+                let cardinality = intervals.iter().map(|[_, len]| *len as usize).sum();
+                let mut store = Store::with_capacity(cardinality);
+                intervals.into_iter().try_for_each(|[s, len]| -> Result<(), io::ErrorKind> {
+                    let end = s.checked_add(len).ok_or(io::ErrorKind::InvalidData)?;
+                    store.insert_range(RangeInclusive::new(s, end));
+                    Ok(())
+                })?;
+                store
+            } else if cardinality <= ARRAY_LIMIT {
+                let mut values = vec![0; cardinality as usize];
+                reader.read_exact(cast_slice_mut(&mut values))?;
+                values.iter_mut().for_each(|n| *n = u16::from_le(*n));
+                let array = a(values).map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
+                Store::Array(array)
+            } else {
+                let mut values = Box::new([0; BITMAP_LENGTH]);
+                reader.read_exact(cast_slice_mut(&mut values[..]))?;
+                values.iter_mut().for_each(|n| *n = u64::from_le(*n));
+                let bitmap = b(cardinality, values)
+                    .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
+                Store::Bitmap(bitmap)
+            };
+
+            let mut right_container = Container { key, store };
+            if left_containers.peek().is_some_and(|container| container.key == key) {
+                right_container |= left_containers.next().unwrap();
+            }
+            if !right_container.is_empty() {
+                containers.push(right_container);
+            }
+        }
+
+        containers.extend(left_containers.cloned());
+        Ok(RoaringBitmap { containers })
+    }
+
+    /// Same merge as `union_with_serialized_impl`, but seeks to each
+    /// container's recorded offset before decoding it.
+    fn union_with_serialized_impl_with_offsets<R, A, AErr, B, BErr>(
+        &self,
+        mut reader: R,
+        a: A,
+        b: B,
+        descriptions: &[[u16; 2]],
+        offsets: &[u32],
+        run_container_bitmap: Option<&[u8]>,
+    ) -> io::Result<RoaringBitmap>
+    where
+        R: io::Read + io::Seek,
+        A: Fn(Vec<u16>) -> Result<ArrayStore, AErr>,
+        AErr: Error + Send + Sync + 'static,
+        B: Fn(u64, Box<[u64; 1024]>) -> Result<BitmapStore, BErr>,
+        BErr: Error + Send + Sync + 'static,
+    {
+        let mut containers = Vec::new();
+        let mut left_containers = self.containers.iter().peekable();
+        for (i, &[key, len_minus_one]) in descriptions.iter().enumerate() {
+            while left_containers.peek().is_some_and(|container| container.key < key) {
+                containers.push(left_containers.next().unwrap().clone());
+            }
+
+            reader.seek(SeekFrom::Start(offsets[i] as u64))?;
+
+            let cardinality = u64::from(len_minus_one) + 1;
+            let is_run_container =
+                run_container_bitmap.as_ref().is_some_and(|bm| bm[i / 8] & (1 << (i % 8)) != 0);
+
+            // Truncated or malformed input is reported through `?` rather than panicking.
+            let store = if is_run_container {
+                let runs = reader.read_u16::<LittleEndian>()?;
+                let mut intervals = vec![[0, 0]; runs as usize];
+                reader.read_exact(cast_slice_mut(&mut intervals))?;
+                intervals.iter_mut().for_each(|[s, len]| {
+                    *s = u16::from_le(*s);
+                    *len = u16::from_le(*len);
+                });
+
+                let cardinality = intervals.iter().map(|[_, len]| *len as usize).sum();
+                let mut store = Store::with_capacity(cardinality);
+                intervals.into_iter().try_for_each(|[s, len]| -> Result<(), io::ErrorKind> {
+                    let end = s.checked_add(len).ok_or(io::ErrorKind::InvalidData)?;
+                    store.insert_range(RangeInclusive::new(s, end));
+                    Ok(())
+                })?;
+                store
+            } else if cardinality <= ARRAY_LIMIT {
+                let mut values = vec![0; cardinality as usize];
+                reader.read_exact(cast_slice_mut(&mut values))?;
+                values.iter_mut().for_each(|n| *n = u16::from_le(*n));
+                let array = a(values).map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
+                Store::Array(array)
+            } else {
+                let mut values = Box::new([0; BITMAP_LENGTH]);
+                reader.read_exact(cast_slice_mut(&mut values[..]))?;
+                values.iter_mut().for_each(|n| *n = u64::from_le(*n));
+                let bitmap = b(cardinality, values)
+                    .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
+                Store::Bitmap(bitmap)
+            };
+
+            let mut right_container = Container { key, store };
+            if left_containers.peek().is_some_and(|container| container.key == key) {
+                right_container |= left_containers.next().unwrap();
+            }
+            if !right_container.is_empty() {
+                containers.push(right_container);
+            }
+        }
+
+        containers.extend(left_containers.cloned());
+        Ok(RoaringBitmap { containers })
+    }
 }
 
 #[cfg(test)]
@@ -297,4 +535,18 @@ mod test {
             prop_assert_eq!(a.intersection_with_serialized_unchecked(Cursor::new(serialized_bytes_b)).unwrap(), a & b);
         }
     }
+
+    proptest! {
+        #[test]
+        fn union_with_serialized_eq_materialized_union(
+            a in RoaringBitmap::arbitrary(),
+            b in RoaringBitmap::arbitrary()
+        ) {
+            let mut serialized_bytes_b = Vec::new();
+            b.serialize_into(&mut serialized_bytes_b).unwrap();
+            let serialized_bytes_b = &serialized_bytes_b[..];
+
+            prop_assert_eq!(a.union_with_serialized_unchecked(Cursor::new(serialized_bytes_b)).unwrap(), a | b);
+        }
+    }
 }