From cc2e0c114ea72724ca12c5c11d52e26cee56bc79 Mon Sep 17 00:00:00 2001 From: coldWater Date: Fri, 16 Jan 2026 11:35:53 +0800 Subject: [PATCH 1/6] bench --- benchmarks/benches/lib.rs | 45 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 44 insertions(+), 1 deletion(-) diff --git a/benchmarks/benches/lib.rs b/benchmarks/benches/lib.rs index 23f6a6c34..d4908c864 100644 --- a/benchmarks/benches/lib.rs +++ b/benchmarks/benches/lib.rs @@ -1,5 +1,6 @@ use itertools::Itertools; use std::cmp::Reverse; +use std::io::Cursor; use std::ops::{BitAnd, BitAndAssign, BitOr, BitOrAssign, BitXor, BitXorAssign, Sub, SubAssign}; use criterion::measurement::Measurement; @@ -117,6 +118,41 @@ fn pairwise_binary_op_matrix( group.finish(); } +fn pairwise_ops_with_serialized( + c: &mut Criterion, + op_name: &str, + op_ref_own: fn(&RoaringBitmap, &[u8]) -> RoaringBitmap, +) { + let mut group = c.benchmark_group(format!("pairwise_{op_name}")); + + for dataset in Datasets { + let pairs = dataset.bitmaps.iter().cloned().tuple_windows::<(_, _)>().collect::>(); + + group.bench_function(BenchmarkId::new("ref_own", &dataset.name), |b| { + b.iter_batched( + || { + pairs + .iter() + .map(|(a, b)| { + let mut buf = Vec::new(); + b.serialize_into(&mut buf).unwrap(); + (a.clone(), buf) + }) + .collect::>() + }, + |bitmaps| { + for (a, b) in bitmaps { + black_box(op_ref_own(&a, &b)); + } + }, + BatchSize::SmallInput, + ); + }); + } + + group.finish(); +} + fn pairwise_binary_op( group: &mut BenchmarkGroup, op_name: &str, @@ -557,6 +593,12 @@ fn successive_or(c: &mut Criterion) { group.finish(); } +fn intersection_with_serialized(c: &mut Criterion) { + pairwise_ops_with_serialized(c, "intersection_with_serialized_unchecked", |a, b| { + a.intersection_with_serialized_unchecked(Cursor::new(b)).unwrap() + }) +} + // LEGACY BENCHMARKS // ================= @@ -740,6 +782,7 @@ criterion_group!( serialization, deserialization, successive_and, - successive_or + successive_or, + intersection_with_serialized, 
); criterion_main!(benches); From a0457176d3582657e4223f1fcd3459a57f1758a3 Mon Sep 17 00:00:00 2001 From: coldWater Date: Thu, 25 Dec 2025 14:26:36 +0800 Subject: [PATCH 2/6] feat: union_with_serialized_unchecked --- roaring/src/bitmap/ops_with_serialized.rs | 534 +++++++++++++--------- 1 file changed, 328 insertions(+), 206 deletions(-) diff --git a/roaring/src/bitmap/ops_with_serialized.rs b/roaring/src/bitmap/ops_with_serialized.rs index 3bae76c0f..d560eba14 100644 --- a/roaring/src/bitmap/ops_with_serialized.rs +++ b/roaring/src/bitmap/ops_with_serialized.rs @@ -1,10 +1,9 @@ use bytemuck::cast_slice_mut; use byteorder::{LittleEndian, ReadBytesExt}; -use core::convert::Infallible; -use std::error::Error; + +use std::cmp::Ordering; use std::io::{self, SeekFrom}; use std::mem; -use std::ops::RangeInclusive; use crate::bitmap::container::Container; use crate::bitmap::serialization::{ @@ -41,239 +40,348 @@ impl RoaringBitmap { /// rb1 & rb2, /// ); /// ``` - pub fn intersection_with_serialized_unchecked(&self, other: R) -> io::Result + pub fn intersection_with_serialized_unchecked( + &self, + mut other: R, + ) -> io::Result where R: io::Read + io::Seek, { - RoaringBitmap::intersection_with_serialized_impl::( - self, - other, - |values| Ok(ArrayStore::from_vec_unchecked(values)), - |len, values| Ok(BitmapStore::from_unchecked(len, values)), - ) + let metadata = BitmapReader::decode(&mut other)?; + let containers = Visitor { + containers: &self.containers, + metadata: &metadata, + handler: &mut BitAndHandler, + } + .visit(&mut other)?; + Ok(RoaringBitmap { containers }) } - fn intersection_with_serialized_impl( - &self, - mut reader: R, - a: A, - b: B, - ) -> io::Result + /// Computes the union between a materialized [`RoaringBitmap`] and a serialized one. + /// + /// This is faster and more space efficient when you only need the union result. 
+ /// It reduces the number of deserialized internal container and therefore + /// the number of allocations and copies of bytes. + /// + /// # Examples + /// + /// ```rust + /// use roaring::RoaringBitmap; + /// use std::io::Cursor; + /// + /// let rb1: RoaringBitmap = (1..4).collect(); + /// let rb2: RoaringBitmap = (3..5).collect(); + /// + /// // Let's say the rb2 bitmap is serialized + /// let mut bytes = Vec::new(); + /// rb2.serialize_into(&mut bytes).unwrap(); + /// let rb2_bytes = Cursor::new(bytes); + /// + /// assert_eq!( + /// rb1.union_with_serialized_unchecked(rb2_bytes).unwrap(), + /// rb1 | rb2, + /// ); + /// ``` + pub fn union_with_serialized_unchecked(&self, mut other: R) -> io::Result + where + R: io::Read + io::Seek, + { + let metadata = BitmapReader::decode(&mut other)?; + let containers = Visitor { + containers: &self.containers, + metadata: &metadata, + handler: &mut BitOrHandler, + } + .visit(&mut other)?; + Ok(RoaringBitmap { containers }) + } +} + +struct Visitor<'a, H> { + containers: &'a [Container], + metadata: &'a BitmapReader, + handler: &'a mut H, +} + +impl Visitor<'_, H> +where + H: VisitorHandler, +{ + fn visit(&mut self, reader: &mut R) -> io::Result> + where + R: io::Read + io::Seek, + { + let mut result = Vec::new(); + let mut descriptions = self + .metadata + .descriptions + .iter() + .enumerate() + .map(|(i, &[key, len_minus_one])| MetaItem { + key, + cardinality: len_minus_one as u32 + 1, + is_run: self.metadata.is_run_container(i), + offset: self.metadata.offsets.as_ref().map(|offsets| offsets[i]), + }) + .peekable(); + let mut containers = self.containers.iter().peekable(); + + loop { + match (containers.peek(), descriptions.peek()) { + (Some(container), Some(item)) => match item.key.cmp(&container.key) { + Ordering::Equal => { + result.extend(self.consume_matched(reader, container, item)?); + descriptions.next(); + containers.next(); + } + Ordering::Less => { + result.extend(self.consume_right(reader, item)?); + 
descriptions.next(); + } + Ordering::Greater => { + result.extend(self.consume_left(container)?); + containers.next(); + } + }, + (None, Some(item)) => { + result.extend(self.consume_right(reader, item)?); + descriptions.next(); + } + (Some(container), None) => { + result.extend(self.consume_left(container)?); + containers.next(); + } + (None, None) => { + return Ok(result); + } + } + } + } + + fn consume_left(&mut self, container: &Container) -> io::Result> { + self.handler.handle_left_only(container) + } + + fn consume_right(&mut self, reader: &mut R, item: &MetaItem) -> io::Result> + where + R: io::Read + io::Seek, + { + if self.handler.need_handle_right_only(item.key) { + let container = item.load_container(reader)?; + self.handler.handle_right_only(container) + } else if item.offset.is_some() { + Ok(None) + } else { + item.skip(reader)?; + Ok(None) + } + } + + fn consume_matched( + &mut self, + reader: &mut R, + left: &Container, + item: &MetaItem, + ) -> io::Result> where R: io::Read + io::Seek, - A: Fn(Vec) -> Result, - AErr: Error + Send + Sync + 'static, - B: Fn(u64, Box<[u64; 1024]>) -> Result, - BErr: Error + Send + Sync + 'static, { - // First read the cookie to determine which version of the format we are reading + if !self.handler.need_handle_matched(left) { + if item.offset.is_none() { + item.skip(reader)?; + } + return Ok(None); + } + + if let Some(offset) = item.offset { + let absolute_offset = self + .metadata + .base_offset + .checked_add(offset as u64) + .ok_or_else(|| io::Error::other("offset overflow"))?; + reader.seek(SeekFrom::Start(absolute_offset))?; + } + let right = item.load_container(reader)?; + self.handler.handel_matched(left, right) + } +} + +struct MetaItem { + key: u16, + cardinality: u32, + is_run: bool, + offset: Option, +} + +impl MetaItem { + fn load_container(&self, reader: &mut R) -> io::Result { + let store = if self.is_run { + let runs = reader.read_u16::()?; + let mut intervals = vec![[0_u16, 0]; runs as usize]; + 
reader.read_u16_into::(cast_slice_mut(&mut intervals))?; + + let cardinality = intervals.iter().map(|[_, len]| *len as usize).sum(); + let mut store = Store::with_capacity(cardinality); + + for [s, len] in intervals { + let end = s.checked_add(len).ok_or(io::ErrorKind::InvalidData)?; + store.insert_range(s..=end); + } + store + } else if self.cardinality as u64 <= ARRAY_LIMIT { + let mut values = vec![0; self.cardinality as usize]; + reader.read_u16_into::(&mut values)?; + let array = ArrayStore::from_vec_unchecked(values); + Store::Array(array) + } else { + let mut values = Box::new([0; BITMAP_LENGTH]); + reader.read_u64_into::(values.as_mut_slice())?; + let bitmap = BitmapStore::from_unchecked(self.cardinality as u64, values); + Store::Bitmap(bitmap) + }; + Ok(Container { key: self.key, store }) + } + + fn skip(&self, reader: &mut R) -> io::Result<()> { + if self.is_run { + let runs = reader.read_u16::()?; + let runs_size = mem::size_of::() * 2 * runs as usize; + reader.seek_relative(runs_size as i64)?; + } else if self.cardinality as u64 <= ARRAY_LIMIT { + let array_size = mem::size_of::() * self.cardinality as usize; + reader.seek_relative(array_size as i64)?; + } else { + let bitmap_size = mem::size_of::() * BITMAP_LENGTH; + reader.seek_relative(bitmap_size as i64)?; + } + Ok(()) + } +} + +trait VisitorHandler { + fn handle_left_only(&mut self, container: &Container) -> io::Result>; + + fn need_handle_right_only(&mut self, _key: u16) -> bool { + false + } + + fn handle_right_only(&mut self, _container: Container) -> io::Result> { + unreachable!() + } + + fn need_handle_matched(&mut self, _container: &Container) -> bool { + true + } + + fn handel_matched( + &mut self, + left: &Container, + right: Container, + ) -> io::Result>; +} + +struct BitAndHandler; + +impl VisitorHandler for BitAndHandler { + fn handle_left_only(&mut self, _container: &Container) -> io::Result> { + Ok(None) + } + + fn handel_matched( + &mut self, + left: &Container, + mut right: 
Container, + ) -> io::Result> { + right &= left; + if right.is_empty() { + Ok(None) + } else { + Ok(Some(right)) + } + } +} + +struct BitOrHandler; + +impl VisitorHandler for BitOrHandler { + fn handle_left_only(&mut self, container: &Container) -> io::Result> { + Ok(Some(container.clone())) + } + + fn need_handle_right_only(&mut self, _key: u16) -> bool { + true + } + + fn handle_right_only(&mut self, container: Container) -> io::Result> { + Ok(Some(container)) + } + + fn handel_matched( + &mut self, + left: &Container, + mut right: Container, + ) -> io::Result> { + right |= left; + if right.is_empty() { + Ok(None) + } else { + Ok(Some(right)) + } + } +} + +#[derive(Debug, Clone)] +struct BitmapReader { + base_offset: u64, + descriptions: Box<[[u16; 2]]>, + offsets: Option>, + run_container_bitmap: Option>, +} + +impl BitmapReader { + pub fn decode(reader: &mut R) -> io::Result { + let base_offset = reader.stream_position()?; + let (size, has_offsets, has_run_containers) = { let cookie = reader.read_u32::()?; if cookie == SERIAL_COOKIE_NO_RUNCONTAINER { (reader.read_u32::()? 
as usize, true, false) } else if (cookie as u16) == SERIAL_COOKIE { - let size = ((cookie >> 16) + 1) as usize; + let size = (cookie >> 16) as usize + 1; (size, size >= NO_OFFSET_THRESHOLD, true) } else { return Err(io::Error::other("unknown cookie value")); } }; - // Read the run container bitmap if necessary + if size > u16::MAX as usize + 1 { + return Err(io::Error::other("size is greater than supported")); + } + let run_container_bitmap = if has_run_containers { - let mut bitmap = vec![0u8; size.div_ceil(8)]; + let mut bitmap = vec![0u8; size.div_ceil(8)].into_boxed_slice(); reader.read_exact(&mut bitmap)?; Some(bitmap) } else { None }; - if size > u16::MAX as usize + 1 { - return Err(io::Error::other("size is greater than supported")); - } - - // Read the container descriptions - let mut descriptions = vec![[0; 2]; size]; - reader.read_exact(cast_slice_mut(&mut descriptions))?; - descriptions.iter_mut().for_each(|[ref mut key, ref mut len]| { - *key = u16::from_le(*key); - *len = u16::from_le(*len); - }); - - if has_offsets { - let mut offsets = vec![0; size]; - reader.read_exact(cast_slice_mut(&mut offsets))?; - offsets.iter_mut().for_each(|offset| *offset = u32::from_le(*offset)); - return self.intersection_with_serialized_impl_with_offsets( - reader, - a, - b, - &descriptions, - &offsets, - run_container_bitmap.as_deref(), - ); - } + let mut descriptions = vec![[0; 2]; size].into_boxed_slice(); + reader.read_u16_into::(cast_slice_mut(descriptions.as_mut()))?; - // Read each container and skip the useless ones - let mut containers = Vec::new(); - for (i, &[key, len_minus_one]) in descriptions.iter().enumerate() { - let container = match self.containers.binary_search_by_key(&key, |c| c.key) { - Ok(index) => self.containers.get(index), - Err(_) => None, - }; - let cardinality = u64::from(len_minus_one) + 1; - - // If the run container bitmap is present, check if this container is a run container - let is_run_container = - 
run_container_bitmap.as_ref().is_some_and(|bm| bm[i / 8] & (1 << (i % 8)) != 0); - - let store = if is_run_container { - let runs = reader.read_u16::()?; - match container { - Some(_) => { - let mut intervals = vec![[0, 0]; runs as usize]; - reader.read_exact(cast_slice_mut(&mut intervals))?; - intervals.iter_mut().for_each(|[s, len]| { - *s = u16::from_le(*s); - *len = u16::from_le(*len); - }); - - let cardinality = intervals.iter().map(|[_, len]| *len as usize).sum(); - let mut store = Store::with_capacity(cardinality); - intervals.into_iter().try_for_each( - |[s, len]| -> Result<(), io::ErrorKind> { - let end = s.checked_add(len).ok_or(io::ErrorKind::InvalidData)?; - store.insert_range(RangeInclusive::new(s, end)); - Ok(()) - }, - )?; - store - } - None => { - let runs_size = mem::size_of::() * 2 * runs as usize; - reader.seek(SeekFrom::Current(runs_size as i64))?; - continue; - } - } - } else if cardinality <= ARRAY_LIMIT { - match container { - Some(_) => { - let mut values = vec![0; cardinality as usize]; - reader.read_exact(cast_slice_mut(&mut values))?; - values.iter_mut().for_each(|n| *n = u16::from_le(*n)); - let array = - a(values).map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; - Store::Array(array) - } - None => { - let array_size = mem::size_of::() * cardinality as usize; - reader.seek(SeekFrom::Current(array_size as i64))?; - continue; - } - } - } else { - match container { - Some(_) => { - let mut values = Box::new([0; BITMAP_LENGTH]); - reader.read_exact(cast_slice_mut(&mut values[..]))?; - values.iter_mut().for_each(|n| *n = u64::from_le(*n)); - let bitmap = b(cardinality, values) - .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; - Store::Bitmap(bitmap) - } - None => { - let bitmap_size = mem::size_of::() * BITMAP_LENGTH; - reader.seek(SeekFrom::Current(bitmap_size as i64))?; - continue; - } - } - }; - - if let Some(container) = container { - let mut other_container = Container { key, store }; - other_container &= 
container; - if !other_container.is_empty() { - containers.push(other_container); - } - } - } + let offsets = if has_offsets { + let mut offsets = vec![0u32; size].into_boxed_slice(); + reader.read_u32_into::(offsets.as_mut())?; + Some(offsets) + } else { + None + }; - Ok(RoaringBitmap { containers }) + Ok(BitmapReader { base_offset, descriptions, offsets, run_container_bitmap }) } - fn intersection_with_serialized_impl_with_offsets( - &self, - mut reader: R, - a: A, - b: B, - descriptions: &[[u16; 2]], - offsets: &[u32], - run_container_bitmap: Option<&[u8]>, - ) -> io::Result - where - R: io::Read + io::Seek, - A: Fn(Vec) -> Result, - AErr: Error + Send + Sync + 'static, - B: Fn(u64, Box<[u64; 1024]>) -> Result, - BErr: Error + Send + Sync + 'static, - { - let mut containers = Vec::new(); - for container in &self.containers { - let i = match descriptions.binary_search_by_key(&container.key, |[k, _]| *k) { - Ok(index) => index, - Err(_) => continue, - }; - - // Seek to the bytes of the container we want. 
- reader.seek(SeekFrom::Start(offsets[i] as u64))?; - - let [key, len_minus_one] = descriptions[i]; - let cardinality = u64::from(len_minus_one) + 1; - - // If the run container bitmap is present, check if this container is a run container - let is_run_container = - run_container_bitmap.as_ref().is_some_and(|bm| bm[i / 8] & (1 << (i % 8)) != 0); - - let store = if is_run_container { - let runs = reader.read_u16::().unwrap(); - let mut intervals = vec![[0, 0]; runs as usize]; - reader.read_exact(cast_slice_mut(&mut intervals)).unwrap(); - intervals.iter_mut().for_each(|[s, len]| { - *s = u16::from_le(*s); - *len = u16::from_le(*len); - }); - - let cardinality = intervals.iter().map(|[_, len]| *len as usize).sum(); - let mut store = Store::with_capacity(cardinality); - intervals.into_iter().try_for_each(|[s, len]| -> Result<(), io::ErrorKind> { - let end = s.checked_add(len).ok_or(io::ErrorKind::InvalidData)?; - store.insert_range(RangeInclusive::new(s, end)); - Ok(()) - })?; - store - } else if cardinality <= ARRAY_LIMIT { - let mut values = vec![0; cardinality as usize]; - reader.read_exact(cast_slice_mut(&mut values)).unwrap(); - values.iter_mut().for_each(|n| *n = u16::from_le(*n)); - let array = a(values).map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; - Store::Array(array) - } else { - let mut values = Box::new([0; BITMAP_LENGTH]); - reader.read_exact(cast_slice_mut(&mut values[..])).unwrap(); - values.iter_mut().for_each(|n| *n = u64::from_le(*n)); - let bitmap = b(cardinality, values) - .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; - Store::Bitmap(bitmap) - }; - - let mut other_container = Container { key, store }; - other_container &= container; - if !other_container.is_empty() { - containers.push(other_container); - } - } - - Ok(RoaringBitmap { containers }) + pub fn is_run_container(&self, index: usize) -> bool { + self.run_container_bitmap.as_ref().is_some_and(|bm| bm[index / 8] & (1 << (index % 8)) != 0) } } @@ -297,4 +405,18 
@@ mod test { prop_assert_eq!(a.intersection_with_serialized_unchecked(Cursor::new(serialized_bytes_b)).unwrap(), a & b); } } + + proptest! { + #[test] + fn union_with_serialized_eq_materialized_intersection( + a in RoaringBitmap::arbitrary(), + b in RoaringBitmap::arbitrary() + ) { + let mut serialized_bytes_b = Vec::new(); + b.serialize_into(&mut serialized_bytes_b).unwrap(); + let serialized_bytes_b = &serialized_bytes_b[..]; + + prop_assert_eq!(a.union_with_serialized_unchecked(Cursor::new(serialized_bytes_b)).unwrap(), a | b); + } + } } From feb819d315cd2249fbc55e63a621e7ed104b6579 Mon Sep 17 00:00:00 2001 From: coldWater Date: Fri, 16 Jan 2026 11:38:20 +0800 Subject: [PATCH 3/6] bench --- benchmarks/benches/lib.rs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/benchmarks/benches/lib.rs b/benchmarks/benches/lib.rs index d4908c864..b05b9b195 100644 --- a/benchmarks/benches/lib.rs +++ b/benchmarks/benches/lib.rs @@ -599,6 +599,12 @@ fn intersection_with_serialized(c: &mut Criterion) { }) } +fn union_with_serialized(c: &mut Criterion) { + pairwise_ops_with_serialized(c, "union_with_serialized_unchecked", |a, b| { + a.union_with_serialized_unchecked(Cursor::new(b)).unwrap() + }) +} + // LEGACY BENCHMARKS // ================= @@ -784,5 +790,6 @@ criterion_group!( successive_and, successive_or, intersection_with_serialized, + union_with_serialized ); criterion_main!(benches); From 95399a7831534a2949ae74775ee12fc341652fad Mon Sep 17 00:00:00 2001 From: coldWater Date: Wed, 4 Mar 2026 10:47:06 +0800 Subject: [PATCH 4/6] revert intersection_with_serialized_unchecked --- roaring/src/bitmap/ops_with_serialized.rs | 258 +++++++++++++++++++--- 1 file changed, 229 insertions(+), 29 deletions(-) diff --git a/roaring/src/bitmap/ops_with_serialized.rs b/roaring/src/bitmap/ops_with_serialized.rs index d560eba14..c1ca92869 100644 --- a/roaring/src/bitmap/ops_with_serialized.rs +++ b/roaring/src/bitmap/ops_with_serialized.rs @@ -1,9 +1,12 @@ use 
bytemuck::cast_slice_mut; use byteorder::{LittleEndian, ReadBytesExt}; +use core::convert::Infallible; use std::cmp::Ordering; +use std::error::Error; use std::io::{self, SeekFrom}; use std::mem; +use std::ops::RangeInclusive; use crate::bitmap::container::Container; use crate::bitmap::serialization::{ @@ -40,20 +43,238 @@ impl RoaringBitmap { /// rb1 & rb2, /// ); /// ``` - pub fn intersection_with_serialized_unchecked( + pub fn intersection_with_serialized_unchecked(&self, other: R) -> io::Result + where + R: io::Read + io::Seek, + { + RoaringBitmap::intersection_with_serialized_impl::( + self, + other, + |values| Ok(ArrayStore::from_vec_unchecked(values)), + |len, values| Ok(BitmapStore::from_unchecked(len, values)), + ) + } + + fn intersection_with_serialized_impl( &self, - mut other: R, + mut reader: R, + a: A, + b: B, ) -> io::Result where R: io::Read + io::Seek, + A: Fn(Vec) -> Result, + AErr: Error + Send + Sync + 'static, + B: Fn(u64, Box<[u64; 1024]>) -> Result, + BErr: Error + Send + Sync + 'static, { - let metadata = BitmapReader::decode(&mut other)?; - let containers = Visitor { - containers: &self.containers, - metadata: &metadata, - handler: &mut BitAndHandler, + // First read the cookie to determine which version of the format we are reading + let (size, has_offsets, has_run_containers) = { + let cookie = reader.read_u32::()?; + if cookie == SERIAL_COOKIE_NO_RUNCONTAINER { + (reader.read_u32::()? 
as usize, true, false) + } else if (cookie as u16) == SERIAL_COOKIE { + let size = ((cookie >> 16) + 1) as usize; + (size, size >= NO_OFFSET_THRESHOLD, true) + } else { + return Err(io::Error::other("unknown cookie value")); + } + }; + + // Read the run container bitmap if necessary + let run_container_bitmap = if has_run_containers { + let mut bitmap = vec![0u8; size.div_ceil(8)]; + reader.read_exact(&mut bitmap)?; + Some(bitmap) + } else { + None + }; + + if size > u16::MAX as usize + 1 { + return Err(io::Error::other("size is greater than supported")); } - .visit(&mut other)?; + + // Read the container descriptions + let mut descriptions = vec![[0; 2]; size]; + reader.read_exact(cast_slice_mut(&mut descriptions))?; + descriptions.iter_mut().for_each(|[ref mut key, ref mut len]| { + *key = u16::from_le(*key); + *len = u16::from_le(*len); + }); + + if has_offsets { + let mut offsets = vec![0; size]; + reader.read_exact(cast_slice_mut(&mut offsets))?; + offsets.iter_mut().for_each(|offset| *offset = u32::from_le(*offset)); + return self.intersection_with_serialized_impl_with_offsets( + reader, + a, + b, + &descriptions, + &offsets, + run_container_bitmap.as_deref(), + ); + } + + // Read each container and skip the useless ones + let mut containers = Vec::new(); + for (i, &[key, len_minus_one]) in descriptions.iter().enumerate() { + let container = match self.containers.binary_search_by_key(&key, |c| c.key) { + Ok(index) => self.containers.get(index), + Err(_) => None, + }; + let cardinality = u64::from(len_minus_one) + 1; + + // If the run container bitmap is present, check if this container is a run container + let is_run_container = + run_container_bitmap.as_ref().is_some_and(|bm| bm[i / 8] & (1 << (i % 8)) != 0); + + let store = if is_run_container { + let runs = reader.read_u16::()?; + match container { + Some(_) => { + let mut intervals = vec![[0, 0]; runs as usize]; + reader.read_exact(cast_slice_mut(&mut intervals))?; + intervals.iter_mut().for_each(|[s, 
len]| { + *s = u16::from_le(*s); + *len = u16::from_le(*len); + }); + + let cardinality = intervals.iter().map(|[_, len]| *len as usize).sum(); + let mut store = Store::with_capacity(cardinality); + intervals.into_iter().try_for_each( + |[s, len]| -> Result<(), io::ErrorKind> { + let end = s.checked_add(len).ok_or(io::ErrorKind::InvalidData)?; + store.insert_range(RangeInclusive::new(s, end)); + Ok(()) + }, + )?; + store + } + None => { + let runs_size = mem::size_of::() * 2 * runs as usize; + reader.seek(SeekFrom::Current(runs_size as i64))?; + continue; + } + } + } else if cardinality <= ARRAY_LIMIT { + match container { + Some(_) => { + let mut values = vec![0; cardinality as usize]; + reader.read_exact(cast_slice_mut(&mut values))?; + values.iter_mut().for_each(|n| *n = u16::from_le(*n)); + let array = + a(values).map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; + Store::Array(array) + } + None => { + let array_size = mem::size_of::() * cardinality as usize; + reader.seek(SeekFrom::Current(array_size as i64))?; + continue; + } + } + } else { + match container { + Some(_) => { + let mut values = Box::new([0; BITMAP_LENGTH]); + reader.read_exact(cast_slice_mut(&mut values[..]))?; + values.iter_mut().for_each(|n| *n = u64::from_le(*n)); + let bitmap = b(cardinality, values) + .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; + Store::Bitmap(bitmap) + } + None => { + let bitmap_size = mem::size_of::() * BITMAP_LENGTH; + reader.seek(SeekFrom::Current(bitmap_size as i64))?; + continue; + } + } + }; + + if let Some(container) = container { + let mut other_container = Container { key, store }; + other_container &= container; + if !other_container.is_empty() { + containers.push(other_container); + } + } + } + + Ok(RoaringBitmap { containers }) + } + + fn intersection_with_serialized_impl_with_offsets( + &self, + mut reader: R, + a: A, + b: B, + descriptions: &[[u16; 2]], + offsets: &[u32], + run_container_bitmap: Option<&[u8]>, + ) -> io::Result 
+ where + R: io::Read + io::Seek, + A: Fn(Vec) -> Result, + AErr: Error + Send + Sync + 'static, + B: Fn(u64, Box<[u64; 1024]>) -> Result, + BErr: Error + Send + Sync + 'static, + { + let mut containers = Vec::new(); + for container in &self.containers { + let i = match descriptions.binary_search_by_key(&container.key, |[k, _]| *k) { + Ok(index) => index, + Err(_) => continue, + }; + + // Seek to the bytes of the container we want. + reader.seek(SeekFrom::Start(offsets[i] as u64))?; + + let [key, len_minus_one] = descriptions[i]; + let cardinality = u64::from(len_minus_one) + 1; + + // If the run container bitmap is present, check if this container is a run container + let is_run_container = + run_container_bitmap.as_ref().is_some_and(|bm| bm[i / 8] & (1 << (i % 8)) != 0); + + let store = if is_run_container { + let runs = reader.read_u16::().unwrap(); + let mut intervals = vec![[0, 0]; runs as usize]; + reader.read_exact(cast_slice_mut(&mut intervals)).unwrap(); + intervals.iter_mut().for_each(|[s, len]| { + *s = u16::from_le(*s); + *len = u16::from_le(*len); + }); + + let cardinality = intervals.iter().map(|[_, len]| *len as usize).sum(); + let mut store = Store::with_capacity(cardinality); + intervals.into_iter().try_for_each(|[s, len]| -> Result<(), io::ErrorKind> { + let end = s.checked_add(len).ok_or(io::ErrorKind::InvalidData)?; + store.insert_range(RangeInclusive::new(s, end)); + Ok(()) + })?; + store + } else if cardinality <= ARRAY_LIMIT { + let mut values = vec![0; cardinality as usize]; + reader.read_exact(cast_slice_mut(&mut values)).unwrap(); + values.iter_mut().for_each(|n| *n = u16::from_le(*n)); + let array = a(values).map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; + Store::Array(array) + } else { + let mut values = Box::new([0; BITMAP_LENGTH]); + reader.read_exact(cast_slice_mut(&mut values[..])).unwrap(); + values.iter_mut().for_each(|n| *n = u64::from_le(*n)); + let bitmap = b(cardinality, values) + .map_err(|e| 
io::Error::new(io::ErrorKind::InvalidData, e))?; + Store::Bitmap(bitmap) + }; + + let mut other_container = Container { key, store }; + other_container &= container; + if !other_container.is_empty() { + containers.push(other_container); + } + } + Ok(RoaringBitmap { containers }) } @@ -280,27 +501,6 @@ trait VisitorHandler { ) -> io::Result>; } -struct BitAndHandler; - -impl VisitorHandler for BitAndHandler { - fn handle_left_only(&mut self, _container: &Container) -> io::Result> { - Ok(None) - } - - fn handel_matched( - &mut self, - left: &Container, - mut right: Container, - ) -> io::Result> { - right &= left; - if right.is_empty() { - Ok(None) - } else { - Ok(Some(right)) - } - } -} - struct BitOrHandler; impl VisitorHandler for BitOrHandler { From 699fb6de44a884631042e99dc84d273850298291 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 4 Mar 2026 09:50:03 +0100 Subject: [PATCH 5/6] Make clippy happy --- roaring/src/bitmap/inherent.rs | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/roaring/src/bitmap/inherent.rs b/roaring/src/bitmap/inherent.rs index cbaa6b5da..c419adc75 100644 --- a/roaring/src/bitmap/inherent.rs +++ b/roaring/src/bitmap/inherent.rs @@ -400,15 +400,11 @@ impl RoaringBitmap { pub fn remove(&mut self, value: u32) -> bool { let (key, index) = util::split(value); match self.containers.binary_search_by_key(&key, |c| c.key) { - Ok(loc) => { - if self.containers[loc].remove(index) { - if self.containers[loc].is_empty() { - self.containers.remove(loc); - } - true - } else { - false + Ok(loc) if self.containers[loc].remove(index) => { + if self.containers[loc].is_empty() { + self.containers.remove(loc); } + true } _ => false, } From cca48d1e411134e5a28522285029151d0bc6ae32 Mon Sep 17 00:00:00 2001 From: coldWater Date: Wed, 4 Mar 2026 17:31:32 +0800 Subject: [PATCH 6/6] update --- roaring/src/bitmap/ops_with_serialized.rs | 414 +++++++++------------- 1 file changed, 166 insertions(+), 248 deletions(-) diff --git 
a/roaring/src/bitmap/ops_with_serialized.rs b/roaring/src/bitmap/ops_with_serialized.rs index c1ca92869..290e850db 100644 --- a/roaring/src/bitmap/ops_with_serialized.rs +++ b/roaring/src/bitmap/ops_with_serialized.rs @@ -2,7 +2,6 @@ use bytemuck::cast_slice_mut; use byteorder::{LittleEndian, ReadBytesExt}; use core::convert::Infallible; -use std::cmp::Ordering; use std::error::Error; use std::io::{self, SeekFrom}; use std::mem; @@ -303,285 +302,204 @@ impl RoaringBitmap { /// rb1 | rb2, /// ); /// ``` - pub fn union_with_serialized_unchecked(&self, mut other: R) -> io::Result + pub fn union_with_serialized_unchecked(&self, other: R) -> io::Result where R: io::Read + io::Seek, { - let metadata = BitmapReader::decode(&mut other)?; - let containers = Visitor { - containers: &self.containers, - metadata: &metadata, - handler: &mut BitOrHandler, - } - .visit(&mut other)?; - Ok(RoaringBitmap { containers }) - } -} - -struct Visitor<'a, H> { - containers: &'a [Container], - metadata: &'a BitmapReader, - handler: &'a mut H, -} - -impl Visitor<'_, H> -where - H: VisitorHandler, -{ - fn visit(&mut self, reader: &mut R) -> io::Result> - where - R: io::Read + io::Seek, - { - let mut result = Vec::new(); - let mut descriptions = self - .metadata - .descriptions - .iter() - .enumerate() - .map(|(i, &[key, len_minus_one])| MetaItem { - key, - cardinality: len_minus_one as u32 + 1, - is_run: self.metadata.is_run_container(i), - offset: self.metadata.offsets.as_ref().map(|offsets| offsets[i]), - }) - .peekable(); - let mut containers = self.containers.iter().peekable(); - - loop { - match (containers.peek(), descriptions.peek()) { - (Some(container), Some(item)) => match item.key.cmp(&container.key) { - Ordering::Equal => { - result.extend(self.consume_matched(reader, container, item)?); - descriptions.next(); - containers.next(); - } - Ordering::Less => { - result.extend(self.consume_right(reader, item)?); - descriptions.next(); - } - Ordering::Greater => { - 
result.extend(self.consume_left(container)?); - containers.next(); - } - }, - (None, Some(item)) => { - result.extend(self.consume_right(reader, item)?); - descriptions.next(); - } - (Some(container), None) => { - result.extend(self.consume_left(container)?); - containers.next(); - } - (None, None) => { - return Ok(result); - } - } - } - } - - fn consume_left(&mut self, container: &Container) -> io::Result> { - self.handler.handle_left_only(container) - } - - fn consume_right(&mut self, reader: &mut R, item: &MetaItem) -> io::Result> - where - R: io::Read + io::Seek, - { - if self.handler.need_handle_right_only(item.key) { - let container = item.load_container(reader)?; - self.handler.handle_right_only(container) - } else if item.offset.is_some() { - Ok(None) - } else { - item.skip(reader)?; - Ok(None) - } + RoaringBitmap::union_with_serialized_impl::( + self, + other, + |values| Ok(ArrayStore::from_vec_unchecked(values)), + |len, values| Ok(BitmapStore::from_unchecked(len, values)), + ) } - fn consume_matched( - &mut self, - reader: &mut R, - left: &Container, - item: &MetaItem, - ) -> io::Result> + fn union_with_serialized_impl( + &self, + mut reader: R, + a: A, + b: B, + ) -> io::Result where R: io::Read + io::Seek, + A: Fn(Vec) -> Result, + AErr: Error + Send + Sync + 'static, + B: Fn(u64, Box<[u64; 1024]>) -> Result, + BErr: Error + Send + Sync + 'static, { - if !self.handler.need_handle_matched(left) { - if item.offset.is_none() { - item.skip(reader)?; + let (size, has_offsets, has_run_containers) = { + let cookie = reader.read_u32::()?; + if cookie == SERIAL_COOKIE_NO_RUNCONTAINER { + (reader.read_u32::()? 
as usize, true, false) + } else if (cookie as u16) == SERIAL_COOKIE { + let size = ((cookie >> 16) + 1) as usize; + (size, size >= NO_OFFSET_THRESHOLD, true) + } else { + return Err(io::Error::other("unknown cookie value")); } - return Ok(None); - } - - if let Some(offset) = item.offset { - let absolute_offset = self - .metadata - .base_offset - .checked_add(offset as u64) - .ok_or_else(|| io::Error::other("offset overflow"))?; - reader.seek(SeekFrom::Start(absolute_offset))?; - } - let right = item.load_container(reader)?; - self.handler.handel_matched(left, right) - } -} - -struct MetaItem { - key: u16, - cardinality: u32, - is_run: bool, - offset: Option, -} - -impl MetaItem { - fn load_container(&self, reader: &mut R) -> io::Result { - let store = if self.is_run { - let runs = reader.read_u16::()?; - let mut intervals = vec![[0_u16, 0]; runs as usize]; - reader.read_u16_into::(cast_slice_mut(&mut intervals))?; - - let cardinality = intervals.iter().map(|[_, len]| *len as usize).sum(); - let mut store = Store::with_capacity(cardinality); + }; - for [s, len] in intervals { - let end = s.checked_add(len).ok_or(io::ErrorKind::InvalidData)?; - store.insert_range(s..=end); - } - store - } else if self.cardinality as u64 <= ARRAY_LIMIT { - let mut values = vec![0; self.cardinality as usize]; - reader.read_u16_into::(&mut values)?; - let array = ArrayStore::from_vec_unchecked(values); - Store::Array(array) + let run_container_bitmap = if has_run_containers { + let mut bitmap = vec![0u8; size.div_ceil(8)]; + reader.read_exact(&mut bitmap)?; + Some(bitmap) } else { - let mut values = Box::new([0; BITMAP_LENGTH]); - reader.read_u64_into::(values.as_mut_slice())?; - let bitmap = BitmapStore::from_unchecked(self.cardinality as u64, values); - Store::Bitmap(bitmap) + None }; - Ok(Container { key: self.key, store }) - } - fn skip(&self, reader: &mut R) -> io::Result<()> { - if self.is_run { - let runs = reader.read_u16::()?; - let runs_size = mem::size_of::() * 2 * runs as 
usize; - reader.seek_relative(runs_size as i64)?; - } else if self.cardinality as u64 <= ARRAY_LIMIT { - let array_size = mem::size_of::() * self.cardinality as usize; - reader.seek_relative(array_size as i64)?; - } else { - let bitmap_size = mem::size_of::() * BITMAP_LENGTH; - reader.seek_relative(bitmap_size as i64)?; + if size > u16::MAX as usize + 1 { + return Err(io::Error::other("size is greater than supported")); } - Ok(()) - } -} -trait VisitorHandler { - fn handle_left_only(&mut self, container: &Container) -> io::Result>; + let mut descriptions = vec![[0; 2]; size]; + reader.read_exact(cast_slice_mut(&mut descriptions))?; + descriptions.iter_mut().for_each(|[key, len]| { + *key = u16::from_le(*key); + *len = u16::from_le(*len); + }); - fn need_handle_right_only(&mut self, _key: u16) -> bool { - false - } + if has_offsets { + let mut offsets = vec![0; size]; + reader.read_exact(cast_slice_mut(&mut offsets))?; + offsets.iter_mut().for_each(|offset| *offset = u32::from_le(*offset)); + return self.union_with_serialized_impl_with_offsets( + reader, + a, + b, + &descriptions, + &offsets, + run_container_bitmap.as_deref(), + ); + } - fn handle_right_only(&mut self, _container: Container) -> io::Result> { - unreachable!() - } + let mut containers = Vec::new(); + let mut left_containers = self.containers.iter().peekable(); + for (i, &[key, len_minus_one]) in descriptions.iter().enumerate() { + while left_containers.peek().is_some_and(|container| container.key < key) { + containers.push(left_containers.next().unwrap().clone()); + } - fn need_handle_matched(&mut self, _container: &Container) -> bool { - true - } + let cardinality = u64::from(len_minus_one) + 1; + let is_run_container = + run_container_bitmap.as_ref().is_some_and(|bm| bm[i / 8] & (1 << (i % 8)) != 0); - fn handel_matched( - &mut self, - left: &Container, - right: Container, - ) -> io::Result>; -} + let store = if is_run_container { + let runs = reader.read_u16::()?; + let mut intervals = vec![[0, 0]; 
runs as usize]; + reader.read_exact(cast_slice_mut(&mut intervals))?; + intervals.iter_mut().for_each(|[s, len]| { + *s = u16::from_le(*s); + *len = u16::from_le(*len); + }); -struct BitOrHandler; + let cardinality = intervals.iter().map(|[_, len]| *len as usize).sum(); + let mut store = Store::with_capacity(cardinality); + intervals.into_iter().try_for_each(|[s, len]| -> Result<(), io::ErrorKind> { + let end = s.checked_add(len).ok_or(io::ErrorKind::InvalidData)?; + store.insert_range(RangeInclusive::new(s, end)); + Ok(()) + })?; + store + } else if cardinality <= ARRAY_LIMIT { + let mut values = vec![0; cardinality as usize]; + reader.read_exact(cast_slice_mut(&mut values))?; + values.iter_mut().for_each(|n| *n = u16::from_le(*n)); + let array = a(values).map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; + Store::Array(array) + } else { + let mut values = Box::new([0; BITMAP_LENGTH]); + reader.read_exact(cast_slice_mut(&mut values[..]))?; + values.iter_mut().for_each(|n| *n = u64::from_le(*n)); + let bitmap = b(cardinality, values) + .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; + Store::Bitmap(bitmap) + }; -impl VisitorHandler for BitOrHandler { - fn handle_left_only(&mut self, container: &Container) -> io::Result> { - Ok(Some(container.clone())) - } + let mut right_container = Container { key, store }; + if left_containers.peek().is_some_and(|container| container.key == key) { + right_container |= left_containers.next().unwrap(); + } + if !right_container.is_empty() { + containers.push(right_container); + } + } - fn need_handle_right_only(&mut self, _key: u16) -> bool { - true + containers.extend(left_containers.cloned()); + Ok(RoaringBitmap { containers }) } - fn handle_right_only(&mut self, container: Container) -> io::Result> { - Ok(Some(container)) - } + fn union_with_serialized_impl_with_offsets( + &self, + mut reader: R, + a: A, + b: B, + descriptions: &[[u16; 2]], + offsets: &[u32], + run_container_bitmap: Option<&[u8]>, + ) -> 
io::Result + where + R: io::Read + io::Seek, + A: Fn(Vec) -> Result, + AErr: Error + Send + Sync + 'static, + B: Fn(u64, Box<[u64; 1024]>) -> Result, + BErr: Error + Send + Sync + 'static, + { + let mut containers = Vec::new(); + let mut left_containers = self.containers.iter().peekable(); + for (i, &[key, len_minus_one]) in descriptions.iter().enumerate() { + while left_containers.peek().is_some_and(|container| container.key < key) { + containers.push(left_containers.next().unwrap().clone()); + } - fn handel_matched( - &mut self, - left: &Container, - mut right: Container, - ) -> io::Result> { - right |= left; - if right.is_empty() { - Ok(None) - } else { - Ok(Some(right)) - } - } -} + reader.seek(SeekFrom::Start(offsets[i] as u64))?; -#[derive(Debug, Clone)] -struct BitmapReader { - base_offset: u64, - descriptions: Box<[[u16; 2]]>, - offsets: Option>, - run_container_bitmap: Option>, -} + let cardinality = u64::from(len_minus_one) + 1; + let is_run_container = + run_container_bitmap.as_ref().is_some_and(|bm| bm[i / 8] & (1 << (i % 8)) != 0); -impl BitmapReader { - pub fn decode(reader: &mut R) -> io::Result { - let base_offset = reader.stream_position()?; + let store = if is_run_container { + let runs = reader.read_u16::()?; + let mut intervals = vec![[0, 0]; runs as usize]; + reader.read_exact(cast_slice_mut(&mut intervals))?; + intervals.iter_mut().for_each(|[s, len]| { + *s = u16::from_le(*s); + *len = u16::from_le(*len); + }); - let (size, has_offsets, has_run_containers) = { - let cookie = reader.read_u32::()?; - if cookie == SERIAL_COOKIE_NO_RUNCONTAINER { - (reader.read_u32::()? 
as usize, true, false) - } else if (cookie as u16) == SERIAL_COOKIE { - let size = (cookie >> 16) as usize + 1; - (size, size >= NO_OFFSET_THRESHOLD, true) + let cardinality = intervals.iter().map(|[_, len]| *len as usize).sum(); + let mut store = Store::with_capacity(cardinality); + intervals.into_iter().try_for_each(|[s, len]| -> Result<(), io::ErrorKind> { + let end = s.checked_add(len).ok_or(io::ErrorKind::InvalidData)?; + store.insert_range(RangeInclusive::new(s, end)); + Ok(()) + })?; + store + } else if cardinality <= ARRAY_LIMIT { + let mut values = vec![0; cardinality as usize]; + reader.read_exact(cast_slice_mut(&mut values))?; + values.iter_mut().for_each(|n| *n = u16::from_le(*n)); + let array = a(values).map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; + Store::Array(array) } else { - return Err(io::Error::other("unknown cookie value")); - } - }; + let mut values = Box::new([0; BITMAP_LENGTH]); + reader.read_exact(cast_slice_mut(&mut values[..]))?; + values.iter_mut().for_each(|n| *n = u64::from_le(*n)); + let bitmap = b(cardinality, values) + .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; + Store::Bitmap(bitmap) + }; - if size > u16::MAX as usize + 1 { - return Err(io::Error::other("size is greater than supported")); + let mut right_container = Container { key, store }; + if left_containers.peek().is_some_and(|container| container.key == key) { + right_container |= left_containers.next().unwrap(); + } + if !right_container.is_empty() { + containers.push(right_container); + } } - let run_container_bitmap = if has_run_containers { - let mut bitmap = vec![0u8; size.div_ceil(8)].into_boxed_slice(); - reader.read_exact(&mut bitmap)?; - Some(bitmap) - } else { - None - }; - - let mut descriptions = vec![[0; 2]; size].into_boxed_slice(); - reader.read_u16_into::(cast_slice_mut(descriptions.as_mut()))?; - - let offsets = if has_offsets { - let mut offsets = vec![0u32; size].into_boxed_slice(); - 
reader.read_u32_into::(offsets.as_mut())?; - Some(offsets) - } else { - None - }; - - Ok(BitmapReader { base_offset, descriptions, offsets, run_container_bitmap }) - } - - pub fn is_run_container(&self, index: usize) -> bool { - self.run_container_bitmap.as_ref().is_some_and(|bm| bm[index / 8] & (1 << (index % 8)) != 0) + containers.extend(left_containers.cloned()); + Ok(RoaringBitmap { containers }) } }